arm_compute v18.11
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index 3c45ab3..70235a2 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -53,7 +53,10 @@
// the kernel to write back output values.
// As the relation between input and output is transposed window.y() is
// used for x anchor and window.x() for y anchor.
- anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
+ if(_info->dimension(0) > 1)
+ {
+ anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
+ }
anchor.set(1, std::max<int>(window.x().start() * _scale_y, anchor[0] + border_size.left) + _y);
// End of the valid region is equal to the start of the last write of the
@@ -66,8 +69,11 @@
// a size of the region.
// As the relation between input and output is transposed window.y() is
// used for x shape and window.x() for y shape.
- shape.set(0, std::min<int>((old_anchor[1] + old_shape[1]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
- shape.set(1, std::min<int>((old_anchor[0] + old_shape[0]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+ if(_info->dimension(0) > 1)
+ {
+ shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+ }
+ shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
// For higher dimensions use the intersection of the window size and the
// valid region of the input
@@ -192,9 +198,9 @@
ARM_COMPUTE_ERROR_ON(window.x().step() == 0);
const int min_x = window.y().start() * _scale_x + _x;
- const int max_x = window.y().end() * _scale_x + _x;
+ const int max_x = (window.y().end() - window.y().step()) * _scale_x + _x + _width;
const int min_y = window.x().start() * _scale_y + _y;
- const int max_y = window.x().end() * _scale_y + _y;
+ const int max_y = (window.x().end() - window.x().step()) * _scale_y + _y + _height;
const TensorShape &shape = _info->tensor_shape();
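
A quick worked example of the updated bounds (a minimal sketch with made-up window values, not library code): the new max_x/max_y anchor at the last window iteration and add the region width/height instead of stopping at the raw window end.

    #include <cassert>

    int main()
    {
        // Hypothetical window along y: start 0, end 8, step 2; scale_x 1, offset _x 0, region width 4.
        const int start = 0, end = 8, step = 2, scale_x = 1, x_off = 0, width = 4;

        const int old_max_x = end * scale_x + x_off;                  // previous formula: 8
        const int new_max_x = (end - step) * scale_x + x_off + width; // updated formula: 10

        // The new bound covers the last iteration (anchored at y = 6) plus the width of the
        // elements it accesses, rather than cutting off at the window end.
        assert(old_max_x == 8 && new_max_x == 10);
        (void)start;
        return 0;
    }
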
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 9703b0f..0947d58 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -64,6 +64,36 @@
}
}
+std::string get_cl_select_type_from_data_type(const DataType &dt)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ return "uchar";
+ case DataType::S8:
+ return "char";
+ case DataType::QASYMM8:
+ return "uchar";
+ case DataType::U16:
+ return "ushort";
+ case DataType::F16:
+ case DataType::S16:
+ return "short";
+ case DataType::U32:
+ return "uint";
+ case DataType::F32:
+ case DataType::S32:
+ return "int";
+ case DataType::U64:
+ return "ulong";
+ case DataType::S64:
+ return "long";
+ default:
+ ARM_COMPUTE_ERROR("Unsupported input data type.");
+ return "";
+ }
+}
+
std::string get_data_size_from_data_type(const DataType &dt)
{
switch(dt)
@@ -114,7 +144,12 @@
bool dot8_supported(const cl::Device &device)
{
- return device_supports_extension(device, "cl_arm_integer_dot_product_int8");
+ std::string device_name = device.getInfo<CL_DEVICE_NAME>();
+ const GPUTarget gpu_target = get_target_from_name(device_name);
+
+ // SW_WORKAROUND: Workaround for DDK revision r14p0, to enable cl_arm_integer_dot_product_int8
+ std::set<GPUTarget> sw_workaround_issue = {GPUTarget::G76};
+ return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0);
}
bool dot8_acc_supported(const cl::Device &device)
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 3c92257..ff4803e 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -152,19 +152,26 @@
{ "arithmetic_add_quantized", "arithmetic_op_quantized.cl" },
{ "arithmetic_add", "arithmetic_op.cl" },
{ "arithmetic_sub", "arithmetic_op.cl" },
+ { "arithmetic_sub_quantized", "arithmetic_op_quantized.cl" },
{ "arithmetic_div", "arithmetic_op.cl" },
+ { "batch_to_space_nchw", "batch_to_space.cl" },
+ { "batch_to_space_static_nchw", "batch_to_space.cl" },
+ { "batch_to_space_nhwc", "batch_to_space.cl" },
+ { "batch_to_space_static_nhwc", "batch_to_space.cl" },
{ "batchnormalization_layer_nchw", "batchnormalization_layer.cl" },
{ "batchnormalization_layer_nhwc", "batchnormalization_layer.cl" },
{ "bitwise_or", "bitwise_op.cl" },
{ "bitwise_and", "bitwise_op.cl" },
{ "bitwise_xor", "bitwise_op.cl" },
{ "bitwise_not", "bitwise_op.cl" },
+ { "bounding_box_transform", "bounding_box_transform.cl" },
{ "channel_combine_NV", "channel_combine.cl" },
{ "channel_combine_RGB888", "channel_combine.cl" },
{ "channel_combine_RGBA8888", "channel_combine.cl" },
{ "channel_combine_UYVY422", "channel_combine.cl" },
{ "channel_combine_YUYV422", "channel_combine.cl" },
{ "channel_shuffle_nchw", "channel_shuffle.cl" },
+ { "channel_shuffle_nhwc", "channel_shuffle.cl" },
{ "channel_extract_NV12", "channel_extract.cl" },
{ "channel_extract_NV21", "channel_extract.cl" },
{ "channel_extract_RGB888", "channel_extract.cl" },
@@ -175,6 +182,8 @@
{ "combine_gradients_L2", "canny.cl" },
{ "concatenate_depth", "concatenate.cl" },
{ "concatenate_width", "concatenate.cl" },
+ { "concatenate_width_x2", "concatenate.cl" },
+ { "concatenate_width_x4", "concatenate.cl" },
{ "convolution_rectangle", "convolution_rectangle.cl" },
{ "col2im", "col2im.cl" },
{ "convert_depth_down", "depth_convert.cl" },
@@ -191,6 +200,7 @@
{ "convolution_separable1x9_static", "convolution9x9.cl" },
{ "convolution_separable9x1_static", "convolution9x9.cl" },
{ "copy_tensor", "copy_tensor.cl" },
+ { "copy_pad_tensor", "copy_tensor.cl" },
{ "copy_plane", "channel_extract.cl" },
{ "copy_planes_3p", "channel_combine.cl" },
{ "copy_to_keypoint", "fast_corners.cl" },
@@ -230,6 +240,7 @@
{ "fill_image_borders_constant", "fill_border.cl" },
{ "fill_image_borders_replicate", "fill_border.cl" },
{ "finalize", "optical_flow_pyramid_lk.cl" },
+ { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
{ "floor_layer", "floor.cl" },
{ "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
{ "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
@@ -240,16 +251,19 @@
{ "gemm_mv", "gemv.cl" },
{ "gemm_mv_quantized", "gemv.cl" },
{ "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_f16_acc32", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f32", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
{ "gemm_mm_floating_point", "gemm.cl" },
{ "gemm_mm_floating_point_f16_bifrost", "gemm.cl" },
+ { "gemm_mm_floating_point_f16_bifrost_acc32", "gemm.cl" },
{ "gemm_mm_floating_point_f32_bifrost", "gemm.cl" },
{ "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
{ "gemm_lc_vm_f32", "gemm.cl" },
{ "gemm_transpose1xW", "gemm.cl" },
{ "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
+ { "gemmlowp_matrix_a_reduction_dot8", "gemmlowp.cl" },
{ "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
{ "gemmlowp_mm_bifrost", "gemmlowp.cl" },
{ "gemmlowp_mm_bifrost_dot8", "gemmlowp.cl" },
@@ -258,8 +272,12 @@
{ "gemmlowp_mm_interleaved_transposed_bifrost_dot8", "gemmlowp.cl" },
{ "gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl" },
{ "gemmlowp_offset_contribution", "gemmlowp.cl" },
+ { "gemmlowp_offset_contribution_quantize_down", "gemmlowp.cl" },
+ { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "gemmlowp.cl" },
{ "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" },
{ "gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl" },
+ { "gemmlowp_output_stage_quantize_down_float", "gemmlowp.cl" },
+ { "generate_proposals_compute_all_anchors", "generate_proposals.cl" },
{ "harris_score_3x3", "harris_corners.cl" },
{ "harris_score_5x5", "harris_corners.cl" },
{ "harris_score_7x7", "harris_corners.cl" },
@@ -288,11 +306,14 @@
{ "IYUV_to_RGB888_bt709", "color_convert.cl" },
{ "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
{ "IYUV_to_YUV444_bt709", "color_convert.cl" },
- { "l2_normalize", "l2_normalize.cl" },
+ { "l2_normalize_x", "l2_normalize.cl" },
+ { "l2_normalize_y", "l2_normalize.cl" },
+ { "l2_normalize_z", "l2_normalize.cl" },
{ "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
{ "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
{ "magnitude_phase", "magnitude_phase.cl" },
{ "mean_stddev_accumulate", "mean_stddev.cl" },
+ { "memset", "memset.cl" },
{ "minmax", "minmaxloc.cl" },
{ "minmax_border", "minmaxloc.cl" },
{ "minmax_layer", "minmax_layer.cl" },
@@ -306,6 +327,10 @@
{ "non_max_suppression", "nonmax.cl" },
{ "normalization_layer_cross_map", "normalization_layer.cl" },
{ "normalization_layer_in_map", "normalization_layer.cl" },
+ { "normalize_planar_yuv_layer_nchw", "normalize_planar_yuv_layer.cl" },
+ { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" },
+ { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" },
+ { "normalize_planar_yuv_layer_q8_nhwc", "normalize_planar_yuv_layer_quantized.cl" },
{ "NV12_to_IYUV_bt709", "color_convert.cl" },
{ "NV12_to_RGB888_bt709", "color_convert.cl" },
{ "NV12_to_RGBA8888_bt709", "color_convert.cl" },
@@ -320,6 +345,7 @@
{ "permute_3201", "permute.cl" },
{ "pixelwise_mul_float", "pixelwise_mul_float.cl" },
{ "pixelwise_mul_int", "pixelwise_mul_int.cl" },
+ { "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
{ "pooling_layer_2", "pooling_layer.cl" },
{ "pooling_layer_3", "pooling_layer.cl" },
{ "pooling_layer_optimized_3", "pooling_layer.cl" },
@@ -328,25 +354,37 @@
{ "pooling_layer_MxN_nhwc", "pooling_layer.cl" },
{ "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
{ "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
+ { "prior_box_layer_nchw", "prior_box_layer.cl" },
+ { "prior_box_layer_nhwc", "prior_box_layer.cl" },
{ "quantization_layer", "quantization_layer.cl" },
- { "reduction_operation", "reduction_operation.cl" },
+ { "reduction_operation_x", "reduction_operation.cl" },
+ { "reduction_operation_quantized_x", "reduction_operation.cl" },
+ { "reduction_operation_y", "reduction_operation.cl" },
+ { "reduction_operation_z", "reduction_operation.cl" },
+ { "reduction_operation_w", "reduction_operation.cl" },
{ "remap_nearest_neighbour", "remap.cl" },
{ "remap_bilinear", "remap.cl" },
+ { "reorg_layer_nchw", "reorg_layer.cl" },
+ { "reorg_layer_nhwc", "reorg_layer.cl" },
{ "reshape_layer", "reshape_layer.cl" },
{ "reshape_to_columns", "convolution_layer.cl" },
{ "RGB888_to_IYUV_bt709", "color_convert.cl" },
{ "RGB888_to_NV12_bt709", "color_convert.cl" },
{ "RGB888_to_RGBA8888_bt709", "color_convert.cl" },
+ { "RGB888_to_U8_bt709", "color_convert.cl" },
{ "RGB888_to_YUV444_bt709", "color_convert.cl" },
{ "RGBA8888_to_IYUV_bt709", "color_convert.cl" },
{ "RGBA8888_to_NV12_bt709", "color_convert.cl" },
{ "RGBA8888_to_RGB888_bt709", "color_convert.cl" },
{ "RGBA8888_to_YUV444_bt709", "color_convert.cl" },
+ { "roi_align_layer", "roi_align_layer.cl" },
{ "roi_pooling_layer", "roi_pooling_layer.cl" },
{ "scale_nearest_neighbour_nchw", "scale.cl" },
{ "scale_nearest_neighbour_nhwc", "scale.cl" },
{ "scale_bilinear_nchw", "scale.cl" },
{ "scale_bilinear_nhwc", "scale.cl" },
+ { "scale_bilinear_quantized_nchw", "scale_quantized.cl" },
+ { "scale_bilinear_quantized_nhwc", "scale_quantized.cl" },
{ "scharr3x3", "scharr_filter.cl" },
{ "sobel3x3", "sobel_filter.cl" },
{ "sobel_separable5x1", "sobel_filter.cl" },
@@ -358,7 +396,12 @@
{ "softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl" },
{ "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" },
{ "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
+ { "space_to_batch_nchw", "space_to_batch.cl" },
+ { "space_to_batch_static_nchw", "space_to_batch.cl" },
+ { "space_to_batch_nhwc", "space_to_batch.cl" },
+ { "space_to_batch_static_nhwc", "space_to_batch.cl" },
{ "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
+ { "strided_slice", "slice_ops.cl" },
{ "suppress_non_maximum", "canny.cl" },
{ "tablelookup_U8", "tablelookup.cl" },
{ "tablelookup_S16", "tablelookup.cl" },
@@ -369,6 +412,8 @@
{ "UYVY422_to_NV12_bt709", "color_convert.cl" },
{ "UYVY422_to_RGB888_bt709", "color_convert.cl" },
{ "UYVY422_to_RGBA8888_bt709", "color_convert.cl" },
+ { "upsample_layer_nchw", "upsample_layer.cl" },
+ { "upsample_layer_nhwc", "upsample_layer.cl" },
{ "warp_affine_nearest_neighbour", "warp_affine.cl" },
{ "warp_affine_bilinear", "warp_affine.cl" },
{ "warp_perspective_nearest_neighbour", "warp_perspective.cl" },
@@ -421,6 +466,8 @@
{ "winograd_output_transform_4x4_5x5_nhwc", "winograd_output_transform.cl" },
{ "winograd_output_transform_4x1_5x1_nhwc", "winograd_output_transform.cl" },
{ "winograd_output_transform_1x4_1x5_nhwc", "winograd_output_transform.cl" },
+ { "yolo_layer_nchw", "yolo_layer.cl" },
+ { "yolo_layer_nhwc", "yolo_layer.cl" },
{ "YUYV422_to_IYUV_bt709", "color_convert.cl" },
{ "YUYV422_to_NV12_bt709", "color_convert.cl" },
{ "YUYV422_to_RGB888_bt709", "color_convert.cl" },
@@ -455,10 +502,18 @@
#include "./cl_kernels/arithmetic_op_quantized.clembed"
},
{
+ "batch_to_space.cl",
+#include "./cl_kernels/batch_to_space.clembed"
+ },
+ {
"bitwise_op.cl",
#include "./cl_kernels/bitwise_op.clembed"
},
{
+ "bounding_box_transform.cl",
+#include "./cl_kernels/bounding_box_transform.clembed"
+ },
+ {
"canny.cl",
#include "./cl_kernels/canny.clembed"
},
@@ -519,6 +574,10 @@
#include "./cl_kernels/copy_tensor.clembed"
},
{
+ "upsample_layer.cl",
+#include "./cl_kernels/upsample_layer.clembed"
+ },
+ {
"deconvolution_layer.cl",
#include "./cl_kernels/deconvolution_layer.clembed"
},
@@ -599,6 +658,10 @@
#include "./cl_kernels/gemv.clembed"
},
{
+ "generate_proposals.cl",
+#include "./cl_kernels/generate_proposals.clembed"
+ },
+ {
"harris_corners.cl",
#include "./cl_kernels/harris_corners.clembed"
},
@@ -639,6 +702,10 @@
#include "./cl_kernels/mean_stddev.clembed"
},
{
+ "memset.cl",
+#include "./cl_kernels/memset.clembed"
+ },
+ {
"minmaxloc.cl",
#include "./cl_kernels/minmaxloc.clembed"
},
@@ -667,6 +734,14 @@
#include "./cl_kernels/normalization_layer.clembed"
},
{
+ "normalize_planar_yuv_layer.cl",
+#include "./cl_kernels/normalize_planar_yuv_layer.clembed"
+ },
+ {
+ "normalize_planar_yuv_layer_quantized.cl",
+#include "./cl_kernels/normalize_planar_yuv_layer_quantized.clembed"
+ },
+ {
"batchnormalization_layer.cl",
#include "./cl_kernels/batchnormalization_layer.clembed"
},
@@ -695,6 +770,10 @@
#include "./cl_kernels/pooling_layer_quantized.clembed"
},
{
+ "prior_box_layer.cl",
+#include "./cl_kernels/prior_box_layer.clembed"
+ },
+ {
"quantization_layer.cl",
#include "./cl_kernels/quantization_layer.clembed"
},
@@ -707,10 +786,18 @@
#include "./cl_kernels/remap.clembed"
},
{
+ "reorg_layer.cl",
+#include "./cl_kernels/reorg_layer.clembed"
+ },
+ {
"reshape_layer.cl",
#include "./cl_kernels/reshape_layer.clembed"
},
{
+ "roi_align_layer.cl",
+#include "./cl_kernels/roi_align_layer.clembed"
+ },
+ {
"roi_pooling_layer.cl",
#include "./cl_kernels/roi_pooling_layer.clembed"
},
@@ -719,6 +806,10 @@
#include "./cl_kernels/scale.clembed"
},
{
+ "scale_quantized.cl",
+#include "./cl_kernels/scale_quantized.clembed"
+ },
+ {
"scharr_filter.cl",
#include "./cl_kernels/scharr_filter.clembed"
},
@@ -735,6 +826,14 @@
#include "./cl_kernels/softmax_layer_quantized.clembed"
},
{
+ "slice_ops.cl",
+#include "./cl_kernels/slice_ops.clembed"
+ },
+ {
+ "space_to_batch.cl",
+#include "./cl_kernels/space_to_batch.clembed"
+ },
+ {
"tablelookup.cl",
#include "./cl_kernels/tablelookup.clembed"
},
@@ -774,6 +873,10 @@
"winograd_output_transform.cl",
#include "./cl_kernels/winograd_output_transform.clembed"
},
+ {
+ "yolo_layer.cl",
+#include "./cl_kernels/yolo_layer.clembed"
+ },
#endif /* EMBEDDED_KERNELS */
};
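
The two tables above map a kernel name to its .cl program and, with EMBEDDED_KERNELS, a program name to its embedded source. A minimal sketch of that two-level lookup, using plain std::map stand-ins rather than the library's actual containers:

    #include <map>
    #include <stdexcept>
    #include <string>

    // Hypothetical stand-ins for the kernel->program and program->source tables above.
    static const std::map<std::string, std::string> kernel_to_program = {
        { "batch_to_space_nchw", "batch_to_space.cl" },
        { "yolo_layer_nhwc", "yolo_layer.cl" },
    };
    static const std::map<std::string, std::string> program_to_source = {
        { "batch_to_space.cl", "/* embedded OpenCL source */" },
        { "yolo_layer.cl", "/* embedded OpenCL source */" },
    };

    std::string source_for_kernel(const std::string &kernel_name)
    {
        const auto prog_it = kernel_to_program.find(kernel_name);
        if(prog_it == kernel_to_program.end())
        {
            throw std::runtime_error("Unknown kernel name: " + kernel_name);
        }
        return program_to_source.at(prog_it->second);
    }
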
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 491e0c4..995fcb4 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -110,10 +110,12 @@
ARM_COMPUTE_UNUSED(idx_start);
}
+#ifndef DOXYGEN_SKIP_THIS
template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window);
template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window);
template void ICLKernel::add_tensor_argument<3>(unsigned &idx, const ICLTensor *tensor, const Window &window);
template void ICLKernel::add_tensor_argument<4>(unsigned &idx, const ICLTensor *tensor, const Window &window);
+#endif /* DOXYGEN_SKIP_THIS */
void ICLKernel::set_target(cl::Device &device)
{
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 486bb6a..6725f36 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -106,6 +106,7 @@
LOAD_FUNCTION_PTR(clReleaseMemObject, handle);
LOAD_FUNCTION_PTR(clGetDeviceInfo, handle);
LOAD_FUNCTION_PTR(clGetDeviceIDs, handle);
+ LOAD_FUNCTION_PTR(clGetMemObjectInfo, handle);
LOAD_FUNCTION_PTR(clRetainEvent, handle);
LOAD_FUNCTION_PTR(clGetPlatformIDs, handle);
LOAD_FUNCTION_PTR(clGetKernelWorkGroupInfo, handle);
@@ -796,6 +797,24 @@
}
}
+cl_int clGetMemObjectInfo(cl_mem memobj,
+ cl_mem_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetMemObjectInfo_ptr;
+ if(func != nullptr)
+ {
+ return func(memobj, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
cl_int clRetainEvent(cl_event event)
{
arm_compute::CLSymbols::get().load_default();
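
For reference, a small host-side sketch of how the newly dispatched clGetMemObjectInfo entry point is typically used; this relies only on the standard OpenCL signature, and the surrounding buffer handling is illustrative:

    #include <CL/cl.h>
    #include <cstdio>

    // Query the allocated size of an existing cl_mem object.
    size_t query_buffer_size(cl_mem buffer)
    {
        size_t size = 0;
        const cl_int err = clGetMemObjectInfo(buffer, CL_MEM_SIZE, sizeof(size), &size, nullptr);
        if(err != CL_SUCCESS)
        {
            std::printf("clGetMemObjectInfo failed: %d\n", err);
            return 0;
        }
        return size;
    }
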
diff --git a/src/core/CL/cl_kernels/activation_helpers.h b/src/core/CL/cl_kernels/activation_helpers.h
new file mode 100644
index 0000000..dfab082
--- /dev/null
+++ b/src/core/CL/cl_kernels/activation_helpers.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(TYPE) && defined(SELECT_TYPE)
+
+#define CONST_ONE 1.f
+#define ABS_OP(a) fabs((a))
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define MLA_OP(a, b, c) ((b) * (c) + (a))
+#define DIV_OP(a, b) ((a) / (b))
+#define EXP_OP(a) exp((a))
+#define LOG_OP(a) log((a))
+#define SQRT_OP(a) sqrt((a))
+#define TANH_OP(a) tanh((a))
+
+// Logistic Activation
+inline TYPE logistic_op(TYPE x)
+{
+ return DIV_OP((TYPE)CONST_ONE, ADD_OP((TYPE)CONST_ONE, EXP_OP(-x)));
+}
+// Hyperbolic Tangent Activation
+inline TYPE tanh_op(TYPE x)
+{
+ return MUL_OP((TYPE)A_VAL, TANH_OP(MUL_OP((TYPE)B_VAL, x)));
+}
+// RELU Activation
+inline TYPE relu_op(TYPE x)
+{
+ return max((TYPE)0, x);
+}
+// Bounded RELU Activation
+inline TYPE brelu_op(TYPE x)
+{
+ return min((TYPE)A_VAL, max((TYPE)0, x));
+}
+// Lower Upper Bounded RELU Activation
+inline TYPE lu_brelu_op(TYPE x)
+{
+ return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
+}
+// Leaky RELU Activation
+inline TYPE lrelu_op(TYPE x)
+{
+ return select(MUL_OP((TYPE)A_VAL, x), x, CONVERT(x > (TYPE)0, SELECT_TYPE));
+}
+// Soft RELU Activation
+inline TYPE srelu_op(TYPE x)
+{
+ return LOG_OP(ADD_OP((TYPE)CONST_ONE, EXP_OP(x)));
+}
+// Absolute Activation
+inline TYPE abs_op(TYPE x)
+{
+ return ABS_OP(x);
+}
+// Square Activation
+inline TYPE square_op(TYPE x)
+{
+ return MUL_OP(x, x);
+}
+// Square-root Activation
+inline TYPE sqrt_op(TYPE x)
+{
+ return SQRT_OP(x);
+}
+// Linear Activation
+inline TYPE linear_op(TYPE x)
+{
+ return MLA_OP((TYPE)B_VAL, (TYPE)A_VAL, x);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+#endif // defined(TYPE) && defined(SELECT_TYPE)
\ No newline at end of file
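
A scalar sketch of the ReLU variants above, to illustrate the select()-based leaky ReLU: on vector operands, x > 0 yields a signed-integer vector whose sign bit drives select(), which is why the comparison result is converted to SELECT_TYPE (short for F16, int for F32, per get_cl_select_type_from_data_type). The C++ below mirrors only the arithmetic, not the OpenCL vector semantics:

    // Scalar equivalents of lrelu_op() / relu_op() from activation_helpers.h (illustrative only).
    float lrelu_scalar(float x, float a_val)
    {
        return (x > 0.0f) ? x : a_val * x; // select(MUL_OP(A_VAL, x), x, x > 0)
    }

    float relu_scalar(float x)
    {
        return (x > 0.0f) ? x : 0.0f; // max((TYPE)0, x)
    }
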
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index 373406a..cf1f434 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -21,80 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "helpers.h"
-
#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define SELECT_TYPE VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
-#define CONST_ONE 1.f
-#define ABS_OP(a) fabs((a))
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define MLA_OP(a, b, c) ((b) * (c) + (a))
-#define DIV_OP(a, b) ((a) / (b))
-#define EXP_OP(a) exp((a))
-#define LOG_OP(a) log((a))
-#define SQRT_OP(a) sqrt((a))
-#define TANH_OP(a) tanh((a))
-
-// Logistic Activation
-inline TYPE logistic_op(TYPE x)
-{
- return DIV_OP((TYPE)CONST_ONE, ADD_OP((TYPE)CONST_ONE, EXP_OP(-x)));
-}
-// Hyperbolic Tangent Activation
-inline TYPE tanh_op(TYPE x)
-{
- return MUL_OP((TYPE)A_VAL, TANH_OP(MUL_OP((TYPE)B_VAL, x)));
-}
-// RELU Tangent Activation
-inline TYPE relu_op(TYPE x)
-{
- return max(0, x);
-}
-// Bounded RELU Activation
-inline TYPE brelu_op(TYPE x)
-{
- return min((TYPE)A_VAL, max(0, x));
-}
-// Lower Upper Bounded RELU Activation
-inline TYPE lu_brelu_op(TYPE x)
-{
- return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
-}
-// Leaky RELU Activation
-inline TYPE lrelu_op(TYPE x)
-{
- return select(MUL_OP((TYPE)A_VAL, x), x, x > (TYPE)0);
-}
-// Soft RELU Activation
-inline TYPE srelu_op(TYPE x)
-{
- return LOG_OP(ADD_OP((TYPE)CONST_ONE, EXP_OP(x)));
-}
-// Absolute Activation
-inline TYPE abs_op(TYPE x)
-{
- return ABS_OP(x);
-}
-// Square Activation
-inline TYPE square_op(TYPE x)
-{
- return MUL_OP(x, x);
-}
-// Square-root Activation
-inline TYPE sqrt_op(TYPE x)
-{
- return SQRT_OP(x);
-}
-// Linear Activation
-inline TYPE linear_op(TYPE x)
-{
- return MLA_OP((TYPE)B_VAL, (TYPE)A_VAL, x);
-}
-
-#define ACTIVATION_OP2(op, x) op##_op(x)
-#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#include "activation_helpers.h"
#if defined(ACT)
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
index 9efb71b..557615e 100644
--- a/src/core/CL/cl_kernels/arithmetic_op.cl
+++ b/src/core/CL/cl_kernels/arithmetic_op.cl
@@ -33,11 +33,13 @@
#define DIV(x, y) (x) / (y)
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
/** This function adds two tensors.
*
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
* @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
* @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -75,14 +77,16 @@
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load values
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+ in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+ in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
// Calculate and store result
- vstore16(ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+ VSTORE(VEC_SIZE)
+ (ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
}
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */
/** This function subtracts one tensor from another.
*
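
A hedged sketch of the compile-time options the vectorized arithmetic_add now expects; the concrete values are illustrative and the real option string is assembled by the library's kernel configuration code:

    #include <string>

    // Illustrative compile-time defines for the vectorized arithmetic_add kernel.
    const std::string build_opts =
        "-DDATA_TYPE_IN1=uchar "
        "-DDATA_TYPE_IN2=uchar "
        "-DDATA_TYPE_OUT=short "
        "-DVEC_SIZE=16";
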
diff --git a/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
index 082317b..fc7fa77 100644
--- a/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ b/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
@@ -31,12 +31,27 @@
#define SUB(x, y) (x) - (y)
#endif /* SATURATE */
-#if defined(OFFSET_IN1)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+
+#if defined(VEC_SIZE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
/** This function adds two tensors.
*
- * @attention The quantization offset must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
*
* @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8
* @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -73,6 +88,69 @@
Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
+ VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
+
+ in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+ in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+ const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+ const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+
+ const VEC_FLOAT qresf32 = (in1f32 + in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
+ const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global uchar *)out.ptr);
+}
+#endif /* defined(VEC_SIZE) */
+
+/** This function subtracts two tensors.
+ *
+ * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arithmetic_sub_quantized(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16);
int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16);
@@ -81,10 +159,10 @@
const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1);
const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2);
- const float16 qresf32 = (in1f32 + in2f32) / ((float16)(float)SCALE_OUT) + ((float16)((float16)OFFSET_OUT));
+ const float16 qresf32 = (in1f32 - in2f32) / ((float16)(float)SCALE_OUT) + ((float16)((float16)OFFSET_OUT));
const uchar16 res = convert_uchar16_sat(convert_int16_rte(qresf32));
// Store result
vstore16(res, 0, (__global uchar *)out.ptr);
}
-#endif /* defined(OFFSET) */
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
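
The quantized add/sub kernels above follow the usual QASYMM8 pattern: dequantize each operand with its own offset and scale, combine in float, then requantize with the output scale/offset and saturate to uchar. A scalar C++ model of that arithmetic (parameter values are supplied by the caller):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar model of arithmetic_add_quantized / arithmetic_sub_quantized for one element.
    uint8_t qadd(uint8_t a, uint8_t b,
                 int offset_in1, float scale_in1,
                 int offset_in2, float scale_in2,
                 int offset_out, float scale_out,
                 bool subtract)
    {
        const float in1 = (static_cast<int>(a) - offset_in1) * scale_in1; // dequantize
        const float in2 = (static_cast<int>(b) - offset_in2) * scale_in2;
        const float res = subtract ? (in1 - in2) : (in1 + in2);
        // Requantize: round to nearest even (like convert_int16_rte), then saturate to [0, 255].
        const int q = static_cast<int>(std::nearbyint(res / scale_out + offset_out));
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }
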
diff --git a/src/core/CL/cl_kernels/batch_to_space.cl b/src/core/CL/cl_kernels/batch_to_space.cl
new file mode 100644
index 0000000..8506fc3
--- /dev/null
+++ b/src/core/CL/cl_kernels/batch_to_space.cl
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nchw(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ VECTOR_DECLARATION(block_shape),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+ const int block_x = *((__global int *)vector_offset(&block, 0));
+ const int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int r = (BATCH_SIZE / (block_x * block_y));
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+ const int w = batch_id % r;
+
+ const int out_x = x * block_x + (batch_id / r) % block_x;
+ const int out_y = y * block_y + (batch_id / r) / block_x;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nhwc(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ VECTOR_DECLARATION(block_shape),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+ const int block_x = *((__global int *)vector_offset(&block, 0));
+ const int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int r = (BATCH_SIZE / (block_x * block_y));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0);
+ const int w = batch_id % r;
+
+ const int out_x = x * block_x + (batch_id / r) % block_x;
+ const int out_y = y * block_y + (batch_id / r) / block_x;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nchw(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ const int block_x = BLOCK_SHAPE_X;
+ const int block_y = BLOCK_SHAPE_Y;
+
+ const int r = (BATCH_SIZE / (block_x * block_y));
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+ const int w = batch_id % r;
+
+ const int out_x = x * block_x + (batch_id / r) % block_x;
+ const int out_y = y * block_y + (batch_id / r) / block_x;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nhwc(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ const int block_x = BLOCK_SHAPE_X;
+ const int block_y = BLOCK_SHAPE_Y;
+
+ const int r = (BATCH_SIZE / (block_x * block_y));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0);
+ const int w = batch_id % r;
+
+ const int out_x = x * block_x + (batch_id / r) % block_x;
+ const int out_y = y * block_y + (batch_id / r) / block_x;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
\ No newline at end of file
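
The index arithmetic in the kernels above maps each input coordinate plus batch_id to an output location. A host-side reference of the NCHW mapping (a sketch mirroring the kernel, not library code):

    struct BatchToSpaceCoord
    {
        int out_x, out_y, out_z, out_w;
    };

    // Mirrors batch_to_space_nchw: batch_size is the input batch count (BATCH_SIZE),
    // block_x/block_y are the block shape values read from the block_shape tensor.
    BatchToSpaceCoord batch_to_space_nchw_coord(int x, int y, int z, int batch_id,
                                                int batch_size, int block_x, int block_y)
    {
        const int r = batch_size / (block_x * block_y); // output batch count
        BatchToSpaceCoord c{};
        c.out_w = batch_id % r;
        c.out_x = x * block_x + (batch_id / r) % block_x;
        c.out_y = y * block_y + (batch_id / r) / block_x;
        c.out_z = z; // channels are untouched in NCHW
        return c;
    }
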
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 5352af3..dfd16e0 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -23,14 +23,14 @@
*/
#include "helpers.h"
-#if defined(VEC_SIZE) && defined(DATA_TYPE)
-
#define ADD_OP(a, b) ((a) + (b))
#define SUB_OP(a, b) ((a) - (b))
#define MUL_OP(a, b) ((a) * (b))
#define INVSQRT_OP(a) rsqrt((a))
#define SQCVT_SAT(a) (a)
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+
#if defined(FUSED_ACTIVATION)
#include "activation_layer.cl"
#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
@@ -258,3 +258,161 @@
(res, 0, (__global DATA_TYPE *)out.ptr);
}
#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
+
+#if defined(NUM_CHANNELS) && defined(DATA_TYPE) && defined(EPSILON)
+/** Fuse batchnorm parameters to convolution layer parameters
+ *
+ * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DNUM_CHANNELS=size. e.g. -DNUM_CHANNELS=16
+ * @attention Batch normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in] conv_w_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] conv_w_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] conv_w_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] conv_w_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] conv_w_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] conv_w_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] conv_w_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] conv_w_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] conv_w_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] conv_w_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] bn_mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in] bn_mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] bn_mean_step_x bn_mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bn_mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] bn_var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in] bn_var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] bn_var_step_x bn_var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bn_var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[out] fused_w_ptr Pointer to the destination weights tensors. Supported data types: same as @p input_ptr
+ * @param[in] fused_w_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] fused_w_step_x fused_w_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] fused_w_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] fused_w_step_y fused_w_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] fused_w_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] fused_w_step_z fused_w_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] fused_w_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] fused_w_step_w fused_w_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] fused_w_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] fused_b_ptr Pointer to the destination bias tensor. Supported data types: same as @p input_ptr
+ * @param[in] fused_b_stride_x Stride of the bias source tensor in X dimension (in bytes)
+ * @param[in] fused_b_step_x fused_b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] fused_b_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] conv_b_ptr Pointer to the source bias tensor. Supported data types: same as @p input_ptr
+ * @param[in] conv_b_stride_x Stride of the source bias tensor in X dimension (in bytes)
+ * @param[in] conv_b_step_x conv_b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] conv_b_offset_first_element_in_bytes The offset of the first element in the source bias tensor
+ * @param[in] bn_beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in] bn_beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] bn_beta_step_x bn_beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bn_beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] bn_gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in] bn_gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] bn_gamma_step_x bn_gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bn_gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void fuse_batchnormalization_layer(TENSOR4D_DECLARATION(conv_w),
+ VECTOR_DECLARATION(bn_mean),
+ VECTOR_DECLARATION(bn_var)
+#ifndef IN_PLACE_W
+ ,
+ TENSOR4D_DECLARATION(fused_w)
+#endif /* not IN_PLACE_W */
+#ifndef IN_PLACE_B
+ ,
+ VECTOR_DECLARATION(fused_b)
+#endif /* not IN_PLACE_B */
+#ifdef HAS_BIAS
+ ,
+ VECTOR_DECLARATION(conv_b)
+#endif /* HAS_BIAS */
+#ifndef USE_DEFAULT_BETA
+ ,
+ VECTOR_DECLARATION(bn_beta)
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ ,
+ VECTOR_DECLARATION(bn_gamma)
+#endif /* USE_DEFAULT_GAMMA */
+ )
+{
+ Tensor4D conv_w = CONVERT_TO_TENSOR4D_STRUCT(conv_w, NUM_CHANNELS);
+ Vector bn_mean = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_mean);
+ Vector bn_var = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_var);
+
+ // In-place ops
+#ifdef IN_PLACE_W
+ Tensor4D fused_w = conv_w;
+#else /* IN_PLACE_W */
+ Tensor4D fused_w = CONVERT_TO_TENSOR4D_STRUCT(fused_w, NUM_CHANNELS);
+#endif /* IN_PLACE_W */
+#ifdef IN_PLACE_B
+ Vector fused_b = conv_b;
+#else /* IN_PLACE_B */
+ Vector fused_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(fused_b);
+#endif /* IN_PLACE_B */
+
+ // Conditional ops
+#ifdef HAS_BIAS
+ Vector conv_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(conv_b);
+#endif /* HAS_BIAS */
+#ifndef USE_DEFAULT_BETA
+ Vector bn_beta = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ Vector bn_gamma = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+ const int current_slice = get_global_id(2) / NUM_CHANNELS;
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ conv_w.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * conv_w_stride_x;
+ fused_w.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * fused_w_stride_x;
+
+ // Load W
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ wn = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)conv_w.ptr);
+#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+ DATA_TYPE wn = *((__global DATA_TYPE *)(conv_w.ptr));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+
+ // rvar = 1 / sqrt(var + epsilon)
+ const DATA_TYPE var = *((__global DATA_TYPE *)(bn_var.ptr + current_slice * bn_var.stride_x));
+ const DATA_TYPE rvar = INVSQRT_OP(ADD_OP(var, SQCVT_SAT((float)EPSILON)));
+ wn *= rvar;
+
+ // Load b
+ const DATA_TYPE mean = *((__global DATA_TYPE *)(bn_mean.ptr + current_slice * bn_mean.stride_x));
+ DATA_TYPE bn = 0;
+#ifdef HAS_BIAS
+ bn = *((__global DATA_TYPE *)(conv_b.ptr + current_slice * conv_b.stride_x));
+#endif /* HAS_BIAS */
+ bn = (bn - mean) * rvar;
+
+#ifndef USE_DEFAULT_GAMMA
+ const DATA_TYPE gamma_scalar = *((__global DATA_TYPE *)(bn_gamma.ptr + current_slice * bn_gamma.stride_x));
+ wn *= gamma_scalar;
+ bn *= gamma_scalar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ const DATA_TYPE beta_scalar = *((__global DATA_TYPE *)(bn_beta.ptr + current_slice * bn_beta.stride_x));
+ bn += beta_scalar;
+#endif /* USE_DEFAULT_BETA */
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Store updated weights
+ VSTORE(VEC_SIZE)
+ (wn, 0, (__global DATA_TYPE *)fused_w.ptr);
+#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE *)(fused_w.ptr)) = wn;
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+
+ // Store updated bias
+ *((__global DATA_TYPE *)(fused_b.ptr + current_slice * fused_b.stride_x)) = bn;
+}
+#endif /* defined(NUM_CHANNELS) && defined(DATA_TYPE) && defined(EPSILON) */
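
Per output channel, the fusion above folds the batch-norm statistics into the convolution parameters: W' = gamma * W / sqrt(var + eps) and b' = gamma * (b - mean) / sqrt(var + eps) + beta. A scalar sketch of that arithmetic, with the kernel's optional bias/beta/gamma passed as plain parameters:

    #include <cmath>

    struct FusedParams
    {
        float weight_scale; // multiply every weight of the channel by this
        float bias;         // fused bias for the channel
    };

    // Scalar model of fuse_batchnormalization_layer for one output channel.
    FusedParams fuse_batchnorm(float mean, float var, float epsilon,
                               float conv_bias /* 0 if HAS_BIAS is not set */,
                               float gamma /* 1 if USE_DEFAULT_GAMMA */,
                               float beta /* 0 if USE_DEFAULT_BETA */)
    {
        const float rvar = 1.0f / std::sqrt(var + epsilon);
        FusedParams p{};
        p.weight_scale = gamma * rvar;                              // wn *= rvar; wn *= gamma
        p.bias         = gamma * (conv_bias - mean) * rvar + beta;  // bn = (bn - mean) * rvar; *= gamma; += beta
        return p;
    }
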
diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/bounding_box_transform.cl
new file mode 100644
index 0000000..0972355
--- /dev/null
+++ b/src/core/CL/cl_kernels/bounding_box_transform.cl
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) // Check for compile time constants
+
+/** Transform proposal bounding boxes into predicted boxes by applying the given bounding box deltas. The transform parameters are defined at compile time
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
+ * -# -DWEIGHT{X,Y,W,H}= Weights [wx, wy, ww, wh] for the deltas
+ * -# -DIMG_WIDTH= Original image width
+ * -# -DIMG_HEIGHT= Original image height
+ * -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes
+ * -# -DSCALE_BEFORE= Scale factor that was applied to the input boxes; it is divided out before the transform
+ * -# -DBBOX_XFORM_CLIP= Upper bound applied to the width/height deltas before the exponential
+ *
+ * @param[in] boxes_ptr Pointer to the boxes tensor. Supported data types: F16/F32
+ * @param[in] boxes_stride_x Stride of the boxes tensor in X dimension (in bytes)
+ * @param[in] boxes_step_x boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] boxes_stride_y Stride of the boxes tensor in Y dimension (in bytes)
+ * @param[in] boxes_step_y boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] boxes_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] boxes_step_z boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] boxes_offset_first_element_in_bytes The offset of the first element in the boxes tensor
+ * @param[out] pred_boxes_ptr Pointer to the predicted boxes. Supported data types: same as @p in_ptr
+ * @param[in] pred_boxes_stride_x Stride of the predicted boxes in X dimension (in bytes)
+ * @param[in] pred_boxes_step_x pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] pred_boxes_stride_y Stride of the predicted boxes in Y dimension (in bytes)
+ * @param[in] pred_boxes_step_y pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] pred_boxes_stride_z Stride of the predicted boxes in Z dimension (in bytes)
+ * @param[in] pred_boxes_step_z pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes
+ * @param[in] deltas_ptr Pointer to the deltas tensor. Supported data types: same as @p in_ptr
+ * @param[in] deltas_stride_x Stride of the deltas tensor in X dimension (in bytes)
+ * @param[in] deltas_step_x deltas_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] deltas_stride_y Stride of the deltas tensor in Y dimension (in bytes)
+ * @param[in] deltas_step_y deltas_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] deltas_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] deltas_step_z deltas_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] deltas_offset_first_element_in_bytes The offset of the first element in the deltas tensor
+ */
+__kernel void bounding_box_transform(
+ VECTOR_DECLARATION(boxes),
+ IMAGE_DECLARATION(pred_boxes),
+ IMAGE_DECLARATION(deltas))
+{
+ // Get pixels pointer
+ Vector boxes = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes);
+ Image pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes);
+ Image deltas = CONVERT_TO_IMAGE_STRUCT(deltas);
+
+ // Load delta and box values into registers
+ const DATA_TYPE one = (DATA_TYPE)1.f;
+ const DATA_TYPE halfone = (DATA_TYPE)0.5f;
+
+ const int py = get_global_id(1); // box
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ scale_before = (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_BEFORE;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ delta = vload4(0, (__global DATA_TYPE *)deltas.ptr);
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ box = vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)) / scale_before;
+
+ // Calculate width and centers of the old boxes
+ const VEC_DATA_TYPE(DATA_TYPE, 2)
+ dims = box.s23 - box.s01 + one;
+ const VEC_DATA_TYPE(DATA_TYPE, 2)
+ ctr = box.s01 + halfone * dims;
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ weights = (VEC_DATA_TYPE(DATA_TYPE, 4))(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H);
+ delta /= weights;
+ delta.s23 = min(delta.s23, (DATA_TYPE)BBOX_XFORM_CLIP);
+
+ // Calculate widths and centers of the new boxes (translation + aspect ratio transformation)
+ const VEC_DATA_TYPE(DATA_TYPE, 2)
+ pred_ctr = delta.s01 * dims + ctr;
+ const VEC_DATA_TYPE(DATA_TYPE, 2)
+ pred_dims = exp(delta.s23) * dims;
+
+ // Useful vector constant definitions
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ max_values = (VEC_DATA_TYPE(DATA_TYPE, 4))(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1);
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ sign = (VEC_DATA_TYPE(DATA_TYPE, 4))(-1, -1, 1, 1);
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ min_values = 0;
+
+ // Calculate the coordinates of the new boxes
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101;
+#ifdef OFFSET // Possibly adjust the predicted boxes
+ pred_box.s23 -= one;
+#endif // Possibly adjust the predicted boxes
+ pred_box = CLAMP(pred_box, min_values, max_values);
+#ifdef SCALE_AFTER // Possibly scale the predicted boxes
+ pred_box *= (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_AFTER;
+#endif // Possibly scale the predicted boxes
+
+ // Store them into the output
+ vstore4(pred_box, 0, (__global DATA_TYPE *)pred_boxes.ptr);
+}
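+// Example build options (illustrative values only):
+//   -DDATA_TYPE=float -DWEIGHT_X=10.f -DWEIGHT_Y=10.f -DWEIGHT_W=5.f -DWEIGHT_H=5.f
+//   -DIMG_WIDTH=800 -DIMG_HEIGHT=600 -DBOX_FIELDS=4 -DSCALE_BEFORE=1.f -DBBOX_XFORM_CLIP=4.135166556742356f
+// With these options each delta is divided by its weight, the width/height deltas are clamped to
+// BBOX_XFORM_CLIP before the exponential, and the resulting box is clamped to [0, IMG_WIDTH - 1] x [0, IMG_HEIGHT - 1].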
+
+#endif // defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE)
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
index 23962e1..3ac67c5 100644
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ b/src/core/CL/cl_kernels/channel_shuffle.cl
@@ -23,19 +23,28 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K)
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
-// Check valid BLOCK_SIZES
-#if BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16
-#error "Only block sizes 4, 8 and 16 are supported"
-#endif /* BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16 */
+// Check valid VEC_SIZES
+#if VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-/** Perfoms channel shuffle see https://arxiv.org/pdf/1707.01083.pdf for details.
+#define DIV_MOD_UINT(x, y, div_res, mod_res) \
+ ({ \
+ div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+ uint r = div_res * (y); \
+ mod_res = (x)-r; \
+ })
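+// For example, DIV_MOD_UINT(7, 3, div_res, mod_res) yields div_res = 2 and mod_res = 1; the division is
+// carried out through a float reciprocal to avoid an integer division on the device.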
+
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
*
- * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group should be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
* K is equal to num_channels / num_groups.
*
* @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -45,6 +54,8 @@
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
@@ -53,80 +64,118 @@
* @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void channel_shuffle_nchw(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+ uint curr_channel = 0; // channel id of input
+ uint batch_id = 0; // batch id
+ uint group_id = 0; // group id
+ uint channel_id = 0; // channel id within the group
- const uint curr_channel = get_global_id(2); // channel id of input
- const uint group_id = curr_channel / K; // group id
- const uint channel_id = curr_channel % K; // channel id within the group
+ // Compute curr_channel and batch_id
+ DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
- const uint x = get_global_id(0) * BLOCK_SIZE;
- const uint y = get_global_id(1) * BLOCK_SIZE;
+ // Compute group_id and channel_id
+ DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
+
+ const uint x = get_global_id(0) * VEC_SIZE;
+ const uint y = get_global_id(1) * 2;
const uint z = channel_id * NUM_GROUPS + group_id;
- // Load the NxN block
- TYPE u0 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 0, 0));
- TYPE u1 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 1, 0));
- TYPE u2 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 2, 0));
- TYPE u3 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 3, 0));
-#if BLOCK_SIZE > 4
- TYPE u4 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 4, 0));
- TYPE u5 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 5, 0));
- TYPE u6 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 6, 0));
- TYPE u7 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 7, 0));
-#if BLOCK_SIZE == 16
- TYPE u8 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 8, 0));
- TYPE u9 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 9, 0));
- TYPE u10 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 10, 0));
- TYPE u11 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 11, 0));
- TYPE u12 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 12, 0));
- TYPE u13 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 13, 0));
- TYPE u14 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 14, 0));
- TYPE u15 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 15, 0));
-#endif /* BLOCK_SIZE == 16 */
-#endif /* BLOCK_SIZE > 4 */
+ // Load the Nx2 block
+ const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+ TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
// Store blocks
- VSTORE(BLOCK_SIZE)
- (u0, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 0, z));
- VSTORE(BLOCK_SIZE)
- (u1, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 1, z));
- VSTORE(BLOCK_SIZE)
- (u2, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 2, z));
- VSTORE(BLOCK_SIZE)
- (u3, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 3, z));
-#if BLOCK_SIZE > 4
- VSTORE(BLOCK_SIZE)
- (u4, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 4, z));
- VSTORE(BLOCK_SIZE)
- (u5, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 5, z));
- VSTORE(BLOCK_SIZE)
- (u6, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 6, z));
- VSTORE(BLOCK_SIZE)
- (u7, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 7, z));
-#if BLOCK_SIZE == 16
- VSTORE(BLOCK_SIZE)
- (u8, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 8, z));
- VSTORE(BLOCK_SIZE)
- (u9, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 9, z));
- VSTORE(BLOCK_SIZE)
- (u10, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 10, z));
- VSTORE(BLOCK_SIZE)
- (u11, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 11, z));
- VSTORE(BLOCK_SIZE)
- (u12, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 12, z));
- VSTORE(BLOCK_SIZE)
- (u13, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 13, z));
- VSTORE(BLOCK_SIZE)
- (u14, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 14, z));
- VSTORE(BLOCK_SIZE)
- (u15, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 15, z));
-#endif /* BLOCK_SIZE == 16 */
-#endif /* BLOCK_SIZE > 4 */
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+ VSTORE(VEC_SIZE)
+ (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+ VSTORE(VEC_SIZE)
+ (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
}
-#endif /* defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K) */
+
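+// Worked example of the shuffle mapping (illustrative values): with NUM_GROUPS = 2 and K = 3
+// (6 channels in total), input channel 4 gives group_id = 1 and channel_id = 1, so it is written to
+// output channel z = channel_id * NUM_GROUPS + group_id = 1 * 2 + 1 = 3.
+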
+#if VEC_SIZE == 4 && defined(LAST_ACCESSED)
+/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note This implementation is only defined for VEC_SIZE = 4
+ * @note The last element accessed along the first dimension must be given as a preprocessor argument using -DLAST_ACCESSED=num. e.g. -DLAST_ACCESSED=64 in order to prevent out-of-bound writes.
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The height of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * K is equal to num_channels / num_groups.
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ const uint curr_channel = min((uint)(get_global_id(0) * VEC_SIZE), (uint)LAST_ACCESSED); // input feature map
+ uint channel_id0 = 0;
+ uint channel_id1 = 0;
+ uint channel_id2 = 0;
+ uint channel_id3 = 0;
+ uint group_id0 = 0;
+ uint group_id1 = 0;
+ uint group_id2 = 0;
+ uint group_id3 = 0;
+ uint y = 0;
+ uint batch_id = 0;
+
+ // Compute curr_channel and batch_id
+ DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, y);
+
+ // Compute group_id and channel_id
+ DIV_MOD_UINT(curr_channel + (uint)0, K, group_id0, channel_id0);
+ DIV_MOD_UINT(curr_channel + (uint)1, K, group_id1, channel_id1);
+ DIV_MOD_UINT(curr_channel + (uint)2, K, group_id2, channel_id2);
+ DIV_MOD_UINT(curr_channel + (uint)3, K, group_id3, channel_id3);
+
+ const uint x = get_global_id(1) * 2;
+ const uint z0 = channel_id0 * (uint)NUM_GROUPS + group_id0;
+ const uint z1 = channel_id1 * (uint)NUM_GROUPS + group_id1;
+ const uint z2 = channel_id2 * (uint)NUM_GROUPS + group_id2;
+ const uint z3 = channel_id3 * (uint)NUM_GROUPS + group_id3;
+
+ // Load the Nx2 block
+ const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + curr_channel * sizeof(DATA_TYPE) + x * src_stride_y + y * src_stride_z + batch_id * src_stride_w;
+ TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+ // Store blocks
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_stride_y + y * dst_stride_z + batch_id * dst_stride_w;
+ *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u0.s0;
+ *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u0.s1;
+ *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u0.s2;
+ *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u0.s3;
+ *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u1.s0;
+ *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u1.s1;
+ *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u1.s2;
+ *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u1.s3;
+}
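+// Worked example (illustrative values): with NUM_GROUPS = 2, K = 3 and curr_channel = 0, the four loaded
+// channels 0..3 scatter to output channels z0..z3 = 0, 2, 4, 1, i.e. channel c goes to (c % K) * NUM_GROUPS + c / K.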
+#endif // VEC_SIZE == 4 && defined(LAST_ACCESSED)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl
index 5e52127..b02d07b 100644
--- a/src/core/CL/cl_kernels/col2im.cl
+++ b/src/core/CL/cl_kernels/col2im.cl
@@ -23,7 +23,7 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)
+#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
#if ELEMENT_SIZE == 1
#define COND_DATA_TYPE char
@@ -41,7 +41,7 @@
* @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
* @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
* @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
- * @note In case of grouping the GROUPING flag must be passed at compile time using -DGROUPING
+ * @note The number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -58,15 +58,16 @@
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void col2im(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- uint dst_stride_w)
+ TENSOR4D_DECLARATION(dst))
{
Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, 0);
const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor
const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor
@@ -86,27 +87,25 @@
// If out-of-bound, overwrite with the first element
data = select((VEC_DATA_TYPE(DATA_TYPE, 8))data.s0, data, cond0);
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes;
+#if NUM_GROUPS > 1
+ // Compute output offset (batches on 4th dimension)
+ int idx = yd * dst_stride_y + xd * dst_stride_x + (get_global_id(2) / NUM_GROUPS) * dst.stride_w;
-#if defined(GROUPING)
- // Compute output offset (batches on 4th dimension, no need to compute manually)
- int idx = yd * dst_stride_y + xd * dst_stride_x;
-
- const uint group = get_global_id(2); // group ID
+ const uint group = get_global_id(2) % NUM_GROUPS; // group ID
x_clamped += group * WIDTH_INPUT;
-#else /* defined(GROUPING) */
+#else /* NUM_GROUPS > 1 */
// Compute output offset (batches on 3rd dimension)
- int idx = yd * dst_stride_y + xd * dst_stride_x + get_global_id(2) * dst_stride_w;
-#endif /* GROUPING */
+ int idx = yd * dst.stride_y + xd * dst.stride_x + get_global_id(2) * dst.stride_w;
+#endif /* NUM_GROUPS > 1 */
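+
+ // In the grouped case, for example (illustrative values) with NUM_GROUPS = 4, the work-item with
+ // get_global_id(2) = 9 writes into batch 9 / 4 = 2 of the output, with its columns shifted by
+ // group * WIDTH_INPUT for group 9 % 4 = 1.
+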
// Store value
- *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s0 * dst_stride_z)) = data.s0;
- *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s1 * dst_stride_z)) = data.s1;
- *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s2 * dst_stride_z)) = data.s2;
- *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s3 * dst_stride_z)) = data.s3;
- *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s4 * dst_stride_z)) = data.s4;
- *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s5 * dst_stride_z)) = data.s5;
- *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s6 * dst_stride_z)) = data.s6;
- *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s7 * dst_stride_z)) = data.s7;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s0 * dst.stride_z)) = data.s0;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s1 * dst.stride_z)) = data.s1;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s2 * dst.stride_z)) = data.s2;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s3 * dst.stride_z)) = data.s3;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s4 * dst.stride_z)) = data.s4;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s5 * dst.stride_z)) = data.s5;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s6 * dst.stride_z)) = data.s6;
+ *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s7 * dst.stride_z)) = data.s7;
}
-#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)
+#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
index 02a0c8e..7a872b4 100644
--- a/src/core/CL/cl_kernels/color_convert.cl
+++ b/src/core/CL/cl_kernels/color_convert.cl
@@ -64,6 +64,54 @@
vstore16(rgba_3, 0, out.ptr + 48);
}
+/** Convert an RGB888 image to U8
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: RGB888
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGB888_to_U8_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ // Process 16 pixels per work-item
+ const uchar16 rgb_0 = vload16(0, in.ptr);
+ const uchar16 rgb_1 = vload16(0, in.ptr + 16);
+ const uchar16 rgb_2 = vload16(0, in.ptr + 32);
+
+ // Rearrange the 16 interleaved RGB values into separate sequences of 16 R, 16 G and 16 B values
+ const uchar16 rgb_r = (uchar16)(rgb_0.s0369, rgb_0.scf, rgb_1.s258b, rgb_1.se, rgb_2.s147a, rgb_2.sd);
+ const uchar16 rgb_g = (uchar16)(rgb_0.s147a, rgb_0.sd, rgb_1.s0369, rgb_1.scf, rgb_2.s258b, rgb_2.se);
+ const uchar16 rgb_b = (uchar16)(rgb_0.s258b, rgb_0.se, rgb_1.s147a, rgb_1.sd, rgb_2.s0369, rgb_2.scf);
+
+ const float16 rgb2u8_red_coef_bt709 = 0.2126f;
+ const float16 rgb2u8_green_coef_bt709 = 0.7152f;
+ const float16 rgb2u8_blue_coef_bt709 = 0.0722f;
+
+ // Compute the 16 greyscale values in float
+ const float16 greyscale_f_0 = rgb2u8_red_coef_bt709 * convert_float16(rgb_r) + rgb2u8_green_coef_bt709 * convert_float16(rgb_g) + rgb2u8_blue_coef_bt709 * convert_float16(rgb_b);
+
+ // Convert them to 16 greyscale uchar values
+ const uchar16 greyscale_u8_0 = convert_uchar16_sat_rtz(greyscale_f_0);
+
+ vstore16(greyscale_u8_0, 0, out.ptr);
+}
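+// Worked example (illustrative pixel): an RGB value of (100, 150, 200) gives
+// 0.2126 * 100 + 0.7152 * 150 + 0.0722 * 200 = 142.98, stored as 142 after the round-towards-zero conversion.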
+
/** Convert an RGB888 image to RGBX8888
*
* Global Workgroup Size [ DIV_CEIL(width, 16), height ]
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 16c4363..0e8805f 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -23,12 +23,220 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE)
-#if defined(WIDTH_OFFSET)
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#if defined(DEPTH) && defined(ELEMENT_SIZE)
+
+#if defined(INPUT1_WIDTH)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not supported"
+#endif // ELEMENT_SIZE
+
+#if VEC_SIZE == 2
+#define SEQ ((int2)(0, 1))
+#elif VEC_SIZE == 4
+#define SEQ ((int4)(0, 1, 2, 3))
+#elif VEC_SIZE == 8
+#define SEQ ((int8)(0, 1, 2, 3, 4, 5, 6, 7))
+#elif VEC_SIZE == 16
+#define SEQ ((int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))
+#else // VEC_SIZE
+#error "Vector size not supported"
+#endif // VEC_SIZE
+/** This kernel concatenates two input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
+ *
+ * @param[in] src1_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src1_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_stride_w Stride of the second source tensor in W dimension (in bytes)
+ * @param[in] src2_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void concatenate_width_x2(
+ TENSOR4D_DECLARATION(src1),
+ TENSOR4D_DECLARATION(src2),
+ TENSOR4D_DECLARATION(dst))
+{
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+ // Calculate input indices
+ const int x = get_global_id(0) * (int)VEC_SIZE;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % (int)DEPTH;
+ const int w = get_global_id(2) / (int)DEPTH;
+ const int x1 = min(x, (int)INPUT1_WIDTH);
+ const int x2 = max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE);
+
+ // Calculate inputs and output addresses
+ const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+ const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+
+ const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
+ const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values = select(src2_values, src1_values, cond);
+
+ VSTORE(VEC_SIZE)
+ (values, 0, (__global DATA_TYPE *)dst.ptr);
+}
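+// Worked example (illustrative values): with VEC_SIZE = 4 and INPUT1_WIDTH = 6, the work-item at x = 4
+// computes x_coords = (4, 5, 6, 7) and cond = (true, true, false, false); lanes 0-1 keep src1_values
+// (elements 4-5 of the first input) while lanes 2-3 take src2_values, which were loaded from
+// x2 = max(4 - 6, -4) = -2 so that those lanes line up with elements 0-1 of the second input.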
+
+#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH)
+/** This kernel concatenates four input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
+ * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
+ * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
+ *
+ * @param[in] src1_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src1_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_stride_w Stride of the second source tensor in W dimension (in bytes)
+ * @param[in] src2_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src3_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src3_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src3_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src3_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src3_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src3_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src3_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src3_stride_w Stride of the third source tensor in W dimension (in bytes)
+ * @param[in] src3_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src3_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] src4_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in] src4_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src4_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src4_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src4_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src4_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src4_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src4_stride_w Stride of the fourth source tensor in W dimension (in bytes)
+ * @param[in] src4_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src4_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void concatenate_width_x4(
+ TENSOR4D_DECLARATION(src1),
+ TENSOR4D_DECLARATION(src2),
+ TENSOR4D_DECLARATION(src3),
+ TENSOR4D_DECLARATION(src4),
+ TENSOR4D_DECLARATION(dst))
+{
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+ // Calculate input indices
+ const int x = get_global_id(0) * (int)VEC_SIZE;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % (int)DEPTH;
+ const int w = get_global_id(2) / (int)DEPTH;
+
+ const int x1 = min(x, (int)INPUT1_WIDTH);
+ const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE), (int)INPUT2_WIDTH);
+ const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)VEC_SIZE), (int)INPUT3_WIDTH);
+ const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)VEC_SIZE);
+
+ // Calculate inputs and output addresses
+ const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+ const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+ const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
+ const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
+
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr);
+
+ const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
+
+ const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in2 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+ const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in3 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+ const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in4 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = select(src2_values, src1_values, cond_in2);
+ values = select(src3_values, values, cond_in3);
+ values = select(src4_values, values, cond_in4);
+
+ VSTORE(VEC_SIZE)
+ (values, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */
+#endif /* defined(INPUT1_WIDTH) */
+#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */
+
+#if defined(WIDTH_OFFSET) && defined(DEPTH)
/** This kernel concatenates the input tensor into the output tensor along the first dimension
*
* @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
* @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -37,6 +245,8 @@
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
@@ -45,15 +255,16 @@
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] offset The offset to the first valid element of the output tensor in bytes
*/
__kernel void concatenate_width(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
+ Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
@@ -61,10 +272,13 @@
VSTORE(VEC_SIZE)
(source_values, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
}
-#endif // defined(WIDTH_OFFSET)
+#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
/** This kernel concatenates the input tensor into the output tensor along the third dimension
*
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ *
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -97,4 +311,4 @@
VSTORE(VEC_SIZE)
(source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z));
}
-#endif // defined(DATA_TYPE)
\ No newline at end of file
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
index 930a676..4bbbf11 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/copy_tensor.cl
@@ -23,6 +23,60 @@
*/
#include "helpers.h"
+#if defined(PAD00) && defined(PAD10) && defined(PAD20) && defined(PAD21) && defined(PAD30) && defined(DATA_TYPE) && defined(VEC_SIZE) // Compile time constants
+
+/** Perform a padded copy of input tensor to the output tensor. Padding values are defined at compile time
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DPAD{d}{0,1} = padding before{0} and after{1} dimension d (d < 4)
+ * -# -DDEPTH = The third dimension (depth) of the tensor (needed only when the batch dimension is padded, i.e. PAD30 > 0)
+ * -# -DDATA_TYPE = Input and output data type.
+ * -# -DVEC_SIZE = Number of elements processed along X per work-item
+ *
+ * @param[in] in_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] out_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void copy_pad_tensor(
+ TENSOR3D_DECLARATION(in),
+ TENSOR3D_DECLARATION(out))
+
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ const int offset_x = PAD00;
+ const int offset_y = PAD10;
+ const int offset_z = PAD20;
+
+#if PAD30 > 0
+ const size_t in_batch = get_global_id(2) / DEPTH;
+ const int total_depth = DEPTH + PAD20 + PAD21;
+ const int offset_w = PAD30 * total_depth + in_batch * (PAD20 + PAD21);
+#else // PAD30 == 0
+ const int offset_w = 0;
+#endif // PAD30
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)tensor3D_offset(&out, offset_x, offset_y, offset_z + offset_w));
+}
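+// Worked example for the batch padding (illustrative values): with DEPTH = 8, PAD20 = PAD21 = 1 and
+// PAD30 = 2, a work-item in input batch 3 uses total_depth = 10 and offset_w = 2 * 10 + 3 * (1 + 1) = 26
+// extra planes along Z, which lands the value in output batch 3 + PAD30 = 5.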
+#endif // Compile time constants
+
/** Performs a copy of input tensor to the output tensor.
*
* @param[in] in_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -56,4 +110,4 @@
// Store result
VSTORE(VEC_SIZE)
(data, 0, (__global DATA_TYPE *)out.ptr);
-}
\ No newline at end of file
+}
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl
index e15482c..e5169f9 100644
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ b/src/core/CL/cl_kernels/deconvolution_layer.cl
@@ -25,7 +25,7 @@
/** This function applies upsample on an input image.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -33,7 +33,7 @@
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: F16/F32
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 77a76b6..bfaa92b 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -24,7 +24,7 @@
#include "helpers.h"
-#if defined(DEPTH_MULTIPLIER)
+#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
#if defined(CONV_STRIDE_X)
#if CONV_STRIDE_X == 1
@@ -147,12 +147,12 @@
/** This OpenCL kernel computes the depthwise convolution 3x3
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
@@ -188,23 +188,28 @@
{
Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
#if defined(HAS_BIAS)
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)
- src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+ // Extract channel and linearized batch indices
+ const int channel = get_global_id(2) % DST_CHANNELS;
+ const int batch = get_global_id(2) / DST_CHANNELS;
+ // Load relevant input and weights data (accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+ src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
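+ // For example (illustrative values), with DEPTH_MULTIPLIER = 2 and DST_CHANNELS = 6 (3 input channels),
+ // get_global_id(2) = 10 gives batch = 1 and channel = 4, so the pointer moves back by
+ // 1 * 3 * 1 + (4 - 2) = 5 planes, from plane 10 to input plane batch * 3 + channel / 2 = 5.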
+ __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
- float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
- float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
- float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));
+ float3 weights_values0 = vload3(0, (__global float *)(weights_addr + offset.s0));
+ float3 weights_values1 = vload3(0, (__global float *)(weights_addr + offset.s1));
+ float3 weights_values2 = vload3(0, (__global float *)(weights_addr + offset.s2));
float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
weights_values1.s0, weights_values1.s1, weights_values1.s2,
weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
- pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
+ pixels += (float2)(*((__global float *)(biases.ptr + channel * biases_stride_x)));
#endif //defined(HAS_BIAS)
vstore2(pixels, 0, (__global float *)dst.ptr);
@@ -266,12 +271,12 @@
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
* stride_x and stride_y are equal to 1
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
@@ -307,15 +312,19 @@
{
Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
float2 pixels0 = 0.0f;
float2 pixels1 = 0.0f;
float2 pixels2 = 0.0f;
float2 pixels3 = 0.0f;
- __global uchar *weights_addr = (__global uchar *)weights.ptr;
- __global uchar *src_addr = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+ // Extract channel and linearized batch indices
+ const int channel = get_global_id(2) % DST_CHANNELS;
+ const int batch = get_global_id(2) / DST_CHANNELS;
+ // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+ __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+ __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
// Load the weights
float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
@@ -346,7 +355,7 @@
#ifdef HAS_BIAS
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));
+ float bias = *((__global float *)(vector_offset(&biases, channel)));
pixels0 += (float2)bias;
pixels1 += (float2)bias;
@@ -363,12 +372,12 @@
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
* stride_x and stride_y are equal to 2
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
@@ -404,13 +413,17 @@
{
Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
float2 pixels0 = 0.0f;
float2 pixels1 = 0.0f;
- __global uchar *weights_addr = (__global uchar *)weights.ptr;
- __global uchar *src_addr = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+ // Extract channel and linearized batch indices
+ const int channel = get_global_id(2) % DST_CHANNELS;
+ const int batch = get_global_id(2) / DST_CHANNELS;
+ // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+ __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+ __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
// Load the weights
float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
@@ -439,7 +452,7 @@
#ifdef HAS_BIAS
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));
+ float bias = *((__global float *)(vector_offset(&biases, channel)));
pixels0 += (float2)bias;
pixels1 += (float2)bias;
@@ -449,7 +462,7 @@
vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
}
-#endif // defined(DEPTH_MULTIPLIER)
+#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
#if defined(NCHW)
#define in_stride_x src_stride_x
@@ -617,7 +630,7 @@
#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
#if defined(CONV_STRIDE_X)
#if CONV_STRIDE_X == 1
#define convolution1x3_f16 convolution1x3_stride_1_f16
@@ -740,14 +753,14 @@
/** This OpenCL kernel computes the depthwise convolution 3x3
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -781,23 +794,28 @@
{
Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
#if defined(HAS_BIAS)
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)
- src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+ // Extract channel and linearized batch indices
+ const int channel = get_global_id(2) % DST_CHANNELS;
+ const int batch = get_global_id(2) / DST_CHANNELS;
+ // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+ src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+ __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
- half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
- half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
- half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + offset.s2));
+ half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
+ half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
+ half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
weights_values1.s0, weights_values1.s1, weights_values1.s2,
weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
- pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x)));
+ pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
#endif //defined(HAS_BIAS)
vstore4(pixels, 0, (__global half *)dst.ptr);
@@ -808,14 +826,14 @@
/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
* when both stride_x and stride_y are equal to 1
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -849,12 +867,16 @@
{
Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+ // Extract channel and linearized batch indices
+ const int channel = get_global_id(2) % DST_CHANNELS;
+ const int batch = get_global_id(2) / DST_CHANNELS;
#ifdef HAS_BIAS
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
+ half bias = *((__global half *)(vector_offset(&biases, channel)));
#endif /* defined(HAS_BIAS) */
half4 pixels0 = 0.0f;
@@ -862,8 +884,9 @@
half4 pixels2 = 0.0f;
half4 pixels3 = 0.0f;
- __global uchar *weights_addr = (__global uchar *)weights.ptr;
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+ // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+ __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+ __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
// Load the weights
half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
@@ -907,14 +930,14 @@
/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
* when both stride_x and stride_y are equal to 2
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -948,19 +971,24 @@
{
Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+ // Extract channel and linearized batch indices
+ const int channel = get_global_id(2) % DST_CHANNELS;
+ const int batch = get_global_id(2) / DST_CHANNELS;
#ifdef HAS_BIAS
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
+ half bias = *((__global half *)(vector_offset(&biases, channel)));
#endif /* defined(HAS_BIAS) */
half4 pixels0 = 0.0f;
half4 pixels1 = 0.0f;
- __global uchar *weights_addr = (__global uchar *)weights.ptr;
- __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+ // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+ __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+ __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
// Load the weights
half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
@@ -994,15 +1022,20 @@
vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
}
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
-#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
+#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#if DATA_TYPE != float && DATA_TYPE != half
+#error "Unsupported data type"
+#endif // DATA_TYPE != float && DATA_TYPE != half
+
+#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
*
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
@@ -1010,14 +1043,16 @@
* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: FP32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1025,6 +1060,8 @@
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -1041,8 +1078,8 @@
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
__kernel void depthwise_convolution_3x3_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
@@ -1051,11 +1088,20 @@
{
int x = get_global_id(0); // channels
int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+ int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+ int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(float) * VEC_SIZE;
+#if defined(DST_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
+#else /* defined(DST_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
int z_coord = 0;
int4 offset = 0;
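Where DST_DEPTH is defined, the third NDRange dimension is expected to pack spatial height and batch together; a plain-C sketch of the address arithmetic follows, with illustrative names (elem_size stands for sizeof(DATA_TYPE)):

#include <stddef.h>

/* Recover (z, b) from the packed third global id and form the same base
 * address the kernel uses; y/z offsets are added later via y_offset/z_coord. */
static const unsigned char *nhwc_src_base(const unsigned char *src_ptr,
                                          size_t src_offset_first_element_in_bytes,
                                          int x, int gid2, int dst_depth,
                                          size_t elem_size, int vec_size,
                                          size_t src_stride_w)
{
    const int z = gid2 % dst_depth; /* spatial coordinate y */
    const int b = gid2 / dst_depth; /* batch                */
    (void)z;

    return src_ptr + src_offset_first_element_in_bytes
           + (size_t)x * elem_size * (size_t)vec_size /* x * sizeof(DATA_TYPE) * VEC_SIZE */
           + (size_t)b * src_stride_w;                /* batch plane (W stride)           */
}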
@@ -1065,15 +1111,15 @@
VEC_FLOAT acc = 0;
// Load weights
- VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
+ VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
+ VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
+ VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
+ VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
+ VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
+ VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
+ VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
+ VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
+ VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
// Load input values
// z == 0
@@ -1085,27 +1131,27 @@
offset = y_offset + (int4)(z_coord * src_stride_z);
offset = min(offset, (int4)max_offset);
- VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
- VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
- VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
+ VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+ VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+ VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
// z == 1
// z_coord can be only negative for z = 0 so we do not need to clamp it
// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
offset = y_offset + (int4)(z_coord * src_stride_z);
- VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
- VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
- VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
+ VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+ VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+ VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
// z == 2
// After z = 1 we can simply add src_stride_z to offset without updating z_coord
// However offset can be out-of-bound so we need to check if it is greater than max_offset
offset += (int4)src_stride_z;
offset = min(offset, (int4)max_offset);
- VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
- VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
- VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
+ VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+ VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+ VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
acc = fma(values0, w0, acc);
acc = fma(values1, w1, acc);
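For reference, the fma() chain above unrolls the following accumulation; a plain-C sketch for a single lane of the VEC_SIZE vector:

/* One output element of one channel: a 3x3 window multiplied tap-by-tap
 * against the 3x3 weights and accumulated, exactly what the fma chain does. */
static float dwc3x3_ref(const float values[9], const float weights[9])
{
    float acc = 0.0f;
    for (int i = 0; i < 9; ++i)
    {
        acc = values[i] * weights[i] + acc; /* == fma(values[i], weights[i], acc) */
    }
    return acc;
}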
@@ -1121,13 +1167,18 @@
#if defined(HAS_BIAS)
Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
- VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global float *)biases.ptr);
+ VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
acc += bias_values;
#endif // defined(HAS_BIAS)
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+#if defined(DST_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else /* defined(DST_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
VSTORE(VEC_SIZE)
- (acc, 0, (__global float *)(dst.ptr));
+ (acc, 0, (__global DATA_TYPE *)(dst_addr));
}
#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
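Putting the @note requirements of these NHWC kernels together, a build-option string might look as follows; the values are illustrative only, and DST_DEPTH is only needed when batches are folded into the third NDRange dimension:

/* Illustrative compile-time options for an F16 run, 4 channels per work-item. */
static const char *nhwc_dwc_build_opts =
    "-DDATA_TYPE=half -DVEC_SIZE=4 -DSRC_DIM_2=112 "
    "-DCONV_PAD_TOP=1 -DCONV_PAD_LEFT=1 "
    "-DCONV_STRIDE_X=2 -DCONV_STRIDE_Y=2 -DDST_DEPTH=56";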
@@ -1141,14 +1192,16 @@
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: FP32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1156,6 +1209,8 @@
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -1172,8 +1227,8 @@
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
*/
__kernel void depthwise_convolution_3x3_nhwc_stride1(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
@@ -1182,11 +1237,20 @@
{
int x = get_global_id(0); // channels
int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+ int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+ int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(float) * VEC_SIZE;
+#if defined(DST_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
+#else /* defined(DST_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
int z_coord = 0;
int4 offset = 0;
@@ -1199,15 +1263,15 @@
VEC_FLOAT acc3 = 0;
// Load weights
- VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
- VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
- VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
- VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
+ VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
+ VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
+ VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
+ VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
+ VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
+ VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
+ VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
+ VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
+ VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
// Load input values
// z == 0
@@ -1219,40 +1283,40 @@
offset = y_offset + (int4)(z_coord * src_stride_z);
offset = min(offset, (int4)max_offset);
- VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
- VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
- VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
- VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s3));
+ VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+ VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+ VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+ VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
// z == 1
// z_coord can be only negative for z = 0 so we do not need to clamp it
// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
offset = y_offset + (int4)(z_coord * src_stride_z);
- VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
- VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
- VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
- VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s3));
+ VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+ VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+ VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+ VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
// z == 2
// After z = 1 we can simply add src_stride_z to offset without updating z_coord
// However offset can be out-of-bound so we need to check if it is greater than max_offset
offset += (int4)src_stride_z;
offset = min(offset, (int4)max_offset);
- VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
- VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
- VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
- VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s3));
+ VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+ VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+ VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+ VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
// z == 3
// After z = 1 we can simply add src_stride_z to offset without updating z_coord
// However offset can be out-of-bound so we need to check if it is greater than max_offset
offset += (int4)src_stride_z;
offset = min(offset, (int4)max_offset);
- VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
- VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
- VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
- VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s3));
+ VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+ VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+ VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+ VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
acc0 = fma(values0, w0, acc0);
acc0 = fma(values1, w1, acc0);
@@ -1299,7 +1363,7 @@
#if defined(HAS_BIAS)
Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
- VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global float *)biases.ptr);
+ VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
acc0 += bias_values;
acc1 += bias_values;
@@ -1307,23 +1371,27 @@
acc3 += bias_values;
#endif // defined(HAS_BIAS)
+#if defined(DST_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
+#endif /* defined(DST_DEPTH) */
VSTORE(VEC_SIZE)
- (acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+ (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
VSTORE(VEC_SIZE)
- (acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+ (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
{
VSTORE(VEC_SIZE)
- (acc2, 0, (__global float *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
+ (acc2, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
VSTORE(VEC_SIZE)
- (acc3, 0, (__global float *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
+ (acc3, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
}
}
#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
\ No newline at end of file
+#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index fe902ed..5a732b4 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -24,7 +24,7 @@
#include "helpers_asymm.h"
-#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
#if defined(FUSED_ACTIVATION)
#define DATA_TYPE uchar
@@ -37,21 +37,21 @@
#define ACTIVATION_FUNC(x) (x)
#endif /* defined(FUSED_ACTIVATION) */
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val);
-#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3));
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
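As a plain-C model of what the reworked macro maps to when the cl_arm_integer_dot_product_int8 extension is present (the uchar4_t struct below is a hypothetical stand-in for the OpenCL vector type):

typedef struct { unsigned char s0, s1, s2, s3; } uchar4_t;

/* arm_dot_acc(x, y, acc): 4-way unsigned 8-bit dot product accumulated into a
 * 32-bit integer; arm_dot is the same without the incoming accumulator. */
static int arm_dot_acc_model(uchar4_t x, uchar4_t y, int acc)
{
    return acc + x.s0 * y.s0 + x.s1 * y.s1 + x.s2 * y.s2 + x.s3 * y.s3;
}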
-#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER)
+#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
#if CONV_STRIDE_X > 3
#error "Stride X not supported"
#endif /* CONV_STRIDE_X > 3 */
-#if !defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
#if CONV_STRIDE_X == 1
#define GET_VALUES(first_value, left, middle, right) \
@@ -87,14 +87,14 @@
/** This function computes the depthwise convolution quantized.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -129,18 +129,25 @@
{
Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+ // Extract channel and linearized batch indices
+ const int channel = get_global_id(2) % DST_CHANNELS;
+ const int batch = get_global_id(2) / DST_CHANNELS;
+
#if defined(HAS_BIAS)
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
- int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
+ int bias_value = *((__global int *)(vector_offset(&biases, channel)));
#endif //defined(HAS_BIAS)
- src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+ // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+ src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+ __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);
- uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);
- uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);
+ uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);
+ uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);
+ uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);
int8 values0 = 0;
int8 sum0 = 0;
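The WEIGHTS_OFFSET / INPUT_OFFSET / K_OFFSET handling in this file follows the usual asymmetric-quantization expansion; a plain-C sketch of the identity for a 3x3 window (the exact placement of each term in the kernel may differ):

/* sum_i (x_i + in_off) * (w_i + w_off)
 *   == dot(x, w) + w_off * sum(x) + in_off * sum(w) + 9 * in_off * w_off,
 * where the last, data-independent term is what a K_OFFSET-style constant
 * folds at compile time. */
static int acc_with_offsets(const unsigned char x[9], const unsigned char w[9],
                            int input_offset, int weights_offset)
{
    int dot = 0, sum_x = 0, sum_w = 0;
    for (int i = 0; i < 9; ++i)
    {
        dot   += x[i] * w[i];
        sum_x += x[i];
        sum_w += w[i];
    }
    return dot + weights_offset * sum_x + input_offset * sum_w
               + 9 * input_offset * weights_offset;
}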
@@ -241,7 +248,16 @@
#endif /* CONV_STRIDE_Y == 1 */
#endif /* K_OFFSET != 0 */
+#if defined(REAL_MULTIPLIER)
+
+ values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
+
values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+
+#endif // defined(REAL_MULTIPLIER)
+
values0 += (int8)OUTPUT_OFFSET;
uchar8 res0 = convert_uchar8_sat(values0);
res0 = max(res0, (uchar8)0);
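The two requantization paths selected above can be sketched in plain C as follows; this is a sketch only, and the exact rounding and saturation of the ASYMM_* helpers may differ in corner cases:

#include <math.h>
#include <stdint.h>

/* -DREAL_MULTIPLIER path: scale the 32-bit accumulator in floating point. */
static int32_t requant_real(int32_t acc, float real_multiplier)
{
    return (int32_t)roundf((float)acc * real_multiplier);
}

/* OUTPUT_MULTIPLIER / OUTPUT_SHIFT path: multiply by a Q0.31 fixed-point
 * multiplier (rounding high half), then shift right with rounding. */
static int32_t requant_fixed_point(int32_t acc, int32_t multiplier, int shift)
{
    const int64_t prod = (int64_t)acc * (int64_t)multiplier;
    const int32_t high = (int32_t)((prod + ((int64_t)1 << 30)) >> 31);
    return (shift > 0) ? ((high + (1 << (shift - 1))) >> shift) : high;
}

Either way, the kernel then adds OUTPUT_OFFSET and saturates the result to uchar, as visible in the surrounding code.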
@@ -249,8 +265,16 @@
vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
#if CONV_STRIDE_Y == 1
+#if defined(REAL_MULTIPLIER)
+
+ values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+
+#endif // defined(REAL_MULTIPLIER)
+
values1 += (int8)OUTPUT_OFFSET;
uchar8 res1 = convert_uchar8_sat(values1);
res1 = max(res1, (uchar8)0);
@@ -260,7 +284,7 @@
#endif /* CONV_STRIDE_Y == 1 */
}
-#else // !defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
#if CONV_STRIDE_X == 1
#define GET_VALUES(first_value, left, middle, right) \
@@ -295,14 +319,14 @@
#endif /* CONV_STRIDE_X */
/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -337,18 +361,25 @@
{
Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
-#if defined(HAS_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- const int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
+ // Extract channel and linearized batch indices
+ const int channel = get_global_id(2) % DST_CHANNELS;
+ const int batch = get_global_id(2) / DST_CHANNELS;
+
+#if defined(HAS_BIAS)
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ const int bias_value = *((__global int *)(vector_offset(&biases, channel)));
#endif //defined(HAS_BIAS)
- src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+ // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+ src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+ __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
- uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);
- uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);
- uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);
+ uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);
+ uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);
+ uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);
uchar8 left0, middle0, right0;
uchar8 left1, middle1, right1;
@@ -383,69 +414,69 @@
#endif /* WEIGHTS_OFFSET != 0 */
#endif // CONV_STRIDE_Y == 1
- ARM_DOT(left0.s0, middle0.s0, right0.s0, left1.s0, w0.s0, w0.s1, w0.s2, w1.s0, values0.s0);
- ARM_DOT(middle1.s0, right1.s0, left2.s0, middle2.s0, w1.s1, w1.s2, w2.s0, w2.s1, values0.s0);
+ ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
+ ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
values0.s0 += right2.s0 * w2.s2;
- ARM_DOT(left0.s1, middle0.s1, right0.s1, left1.s1, w0.s0, w0.s1, w0.s2, w1.s0, values0.s1);
- ARM_DOT(middle1.s1, right1.s1, left2.s1, middle2.s1, w1.s1, w1.s2, w2.s0, w2.s1, values0.s1);
+ ARM_DOT((uchar4)(left0.s1, middle0.s1, right0.s1, left1.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);
+ ARM_DOT((uchar4)(middle1.s1, right1.s1, left2.s1, middle2.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);
values0.s1 += right2.s1 * w2.s2;
- ARM_DOT(left0.s2, middle0.s2, right0.s2, left1.s2, w0.s0, w0.s1, w0.s2, w1.s0, values0.s2);
- ARM_DOT(middle1.s2, right1.s2, left2.s2, middle2.s2, w1.s1, w1.s2, w2.s0, w2.s1, values0.s2);
+ ARM_DOT((uchar4)(left0.s2, middle0.s2, right0.s2, left1.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);
+ ARM_DOT((uchar4)(middle1.s2, right1.s2, left2.s2, middle2.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);
values0.s2 += right2.s2 * w2.s2;
- ARM_DOT(left0.s3, middle0.s3, right0.s3, left1.s3, w0.s0, w0.s1, w0.s2, w1.s0, values0.s3);
- ARM_DOT(middle1.s3, right1.s3, left2.s3, middle2.s3, w1.s1, w1.s2, w2.s0, w2.s1, values0.s3);
+ ARM_DOT((uchar4)(left0.s3, middle0.s3, right0.s3, left1.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);
+ ARM_DOT((uchar4)(middle1.s3, right1.s3, left2.s3, middle2.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);
values0.s3 += right2.s3 * w2.s2;
- ARM_DOT(left0.s4, middle0.s4, right0.s4, left1.s4, w0.s0, w0.s1, w0.s2, w1.s0, values0.s4);
- ARM_DOT(middle1.s4, right1.s4, left2.s4, middle2.s4, w1.s1, w1.s2, w2.s0, w2.s1, values0.s4);
+ ARM_DOT((uchar4)(left0.s4, middle0.s4, right0.s4, left1.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);
+ ARM_DOT((uchar4)(middle1.s4, right1.s4, left2.s4, middle2.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);
values0.s4 += right2.s4 * w2.s2;
- ARM_DOT(left0.s5, middle0.s5, right0.s5, left1.s5, w0.s0, w0.s1, w0.s2, w1.s0, values0.s5);
- ARM_DOT(middle1.s5, right1.s5, left2.s5, middle2.s5, w1.s1, w1.s2, w2.s0, w2.s1, values0.s5);
+ ARM_DOT((uchar4)(left0.s5, middle0.s5, right0.s5, left1.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);
+ ARM_DOT((uchar4)(middle1.s5, right1.s5, left2.s5, middle2.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);
values0.s5 += right2.s5 * w2.s2;
- ARM_DOT(left0.s6, middle0.s6, right0.s6, left1.s6, w0.s0, w0.s1, w0.s2, w1.s0, values0.s6);
- ARM_DOT(middle1.s6, right1.s6, left2.s6, middle2.s6, w1.s1, w1.s2, w2.s0, w2.s1, values0.s6);
+ ARM_DOT((uchar4)(left0.s6, middle0.s6, right0.s6, left1.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);
+ ARM_DOT((uchar4)(middle1.s6, right1.s6, left2.s6, middle2.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);
values0.s6 += right2.s6 * w2.s2;
- ARM_DOT(left0.s7, middle0.s7, right0.s7, left1.s7, w0.s0, w0.s1, w0.s2, w1.s0, values0.s7);
- ARM_DOT(middle1.s7, right1.s7, left2.s7, middle2.s7, w1.s1, w1.s2, w2.s0, w2.s1, values0.s7);
+ ARM_DOT((uchar4)(left0.s7, middle0.s7, right0.s7, left1.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);
+ ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
values0.s7 += right2.s7 * w2.s2;
#if CONV_STRIDE_Y == 1
- ARM_DOT(left1.s0, middle1.s0, right1.s0, left2.s0, w0.s0, w0.s1, w0.s2, w1.s0, values1.s0);
- ARM_DOT(middle2.s0, right2.s0, left3.s0, middle3.s0, w1.s1, w1.s2, w2.s0, w2.s1, values1.s0);
+ ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
+ ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
values1.s0 += right3.s0 * w2.s2;
- ARM_DOT(left1.s1, middle1.s1, right1.s1, left2.s1, w0.s0, w0.s1, w0.s2, w1.s0, values1.s1);
- ARM_DOT(middle2.s1, right2.s1, left3.s1, middle3.s1, w1.s1, w1.s2, w2.s0, w2.s1, values1.s1);
+ ARM_DOT((uchar4)(left1.s1, middle1.s1, right1.s1, left2.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);
+ ARM_DOT((uchar4)(middle2.s1, right2.s1, left3.s1, middle3.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);
values1.s1 += right3.s1 * w2.s2;
- ARM_DOT(left1.s2, middle1.s2, right1.s2, left2.s2, w0.s0, w0.s1, w0.s2, w1.s0, values1.s2);
- ARM_DOT(middle2.s2, right2.s2, left3.s2, middle3.s2, w1.s1, w1.s2, w2.s0, w2.s1, values1.s2);
+ ARM_DOT((uchar4)(left1.s2, middle1.s2, right1.s2, left2.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);
+ ARM_DOT((uchar4)(middle2.s2, right2.s2, left3.s2, middle3.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);
values1.s2 += right3.s2 * w2.s2;
- ARM_DOT(left1.s3, middle1.s3, right1.s3, left2.s3, w0.s0, w0.s1, w0.s2, w1.s0, values1.s3);
- ARM_DOT(middle2.s3, right2.s3, left3.s3, middle3.s3, w1.s1, w1.s2, w2.s0, w2.s1, values1.s3);
+ ARM_DOT((uchar4)(left1.s3, middle1.s3, right1.s3, left2.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);
+ ARM_DOT((uchar4)(middle2.s3, right2.s3, left3.s3, middle3.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);
values1.s3 += right3.s3 * w2.s2;
- ARM_DOT(left1.s4, middle1.s4, right1.s4, left2.s4, w0.s0, w0.s1, w0.s2, w1.s0, values1.s4);
- ARM_DOT(middle2.s4, right2.s4, left3.s4, middle3.s4, w1.s1, w1.s2, w2.s0, w2.s1, values1.s4);
+ ARM_DOT((uchar4)(left1.s4, middle1.s4, right1.s4, left2.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);
+ ARM_DOT((uchar4)(middle2.s4, right2.s4, left3.s4, middle3.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);
values1.s4 += right3.s4 * w2.s2;
- ARM_DOT(left1.s5, middle1.s5, right1.s5, left2.s5, w0.s0, w0.s1, w0.s2, w1.s0, values1.s5);
- ARM_DOT(middle2.s5, right2.s5, left3.s5, middle3.s5, w1.s1, w1.s2, w2.s0, w2.s1, values1.s5);
+ ARM_DOT((uchar4)(left1.s5, middle1.s5, right1.s5, left2.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);
+ ARM_DOT((uchar4)(middle2.s5, right2.s5, left3.s5, middle3.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);
values1.s5 += right3.s5 * w2.s2;
- ARM_DOT(left1.s6, middle1.s6, right1.s6, left2.s6, w0.s0, w0.s1, w0.s2, w1.s0, values1.s6);
- ARM_DOT(middle2.s6, right2.s6, left3.s6, middle3.s6, w1.s1, w1.s2, w2.s0, w2.s1, values1.s6);
+ ARM_DOT((uchar4)(left1.s6, middle1.s6, right1.s6, left2.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);
+ ARM_DOT((uchar4)(middle2.s6, right2.s6, left3.s6, middle3.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);
values1.s6 += right3.s6 * w2.s2;
- ARM_DOT(left1.s7, middle1.s7, right1.s7, left2.s7, w0.s0, w0.s1, w0.s2, w1.s0, values1.s7);
- ARM_DOT(middle2.s7, right2.s7, left3.s7, middle3.s7, w1.s1, w1.s2, w2.s0, w2.s1, values1.s7);
+ ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
+ ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
values1.s7 += right3.s7 * w2.s2;
#endif // CONV_STRIDE_Y == 1
@@ -480,7 +511,16 @@
#endif /* CONV_STRIDE_Y == 1 */
#endif /* K_OFFSET != 0 */
+#if defined(REAL_MULTIPLIER)
+
+ values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
+
values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+
+#endif // defined(REAL_MULTIPLIER)
+
values0 += (int8)OUTPUT_OFFSET;
uchar8 res0 = convert_uchar8_sat(values0);
res0 = max(res0, (uchar8)0);
@@ -489,7 +529,16 @@
vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
#if CONV_STRIDE_Y == 1
+#if defined(REAL_MULTIPLIER)
+
+ values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
+
values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+
+#endif // defined(REAL_MULTIPLIER)
+
values1 += (int8)OUTPUT_OFFSET;
uchar8 res1 = convert_uchar8_sat(values1);
res1 = max(res1, (uchar8)0);
@@ -499,15 +548,16 @@
#endif /* CONV_STRIDE_Y == 1 */
}
-#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) */
+#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */
#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)
@@ -523,37 +573,66 @@
#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
#endif /* WEIGHTS_OFFSET != 0 */
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
({ \
- ARM_DOT(val0.s0, val1.s0, val2.s0, val3.s0, w0.s0, w1.s0, w2.s0, w3.s0, acc.s0); \
- ARM_DOT(val4.s0, val5.s0, val6.s0, val7.s0, w4.s0, w5.s0, w6.s0, w7.s0, acc.s0); \
+ ARM_DOT((uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), (uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), acc.s0); \
+ ARM_DOT((uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), (uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), acc.s0); \
acc.s0 += val8.s0 * w8.s0; \
\
- ARM_DOT(val0.s1, val1.s1, val2.s1, val3.s1, w0.s1, w1.s1, w2.s1, w3.s1, acc.s1); \
- ARM_DOT(val4.s1, val5.s1, val6.s1, val7.s1, w4.s1, w5.s1, w6.s1, w7.s1, acc.s1); \
+ ARM_DOT((uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), (uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), acc.s1); \
+ ARM_DOT((uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), (uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), acc.s1); \
acc.s1 += val8.s1 * w8.s1; \
\
- ARM_DOT(val0.s2, val1.s2, val2.s2, val3.s2, w0.s2, w1.s2, w2.s2, w3.s2, acc.s2); \
- ARM_DOT(val4.s2, val5.s2, val6.s2, val7.s2, w4.s2, w5.s2, w6.s2, w7.s2, acc.s2); \
+ ARM_DOT((uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), (uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), acc.s2); \
+ ARM_DOT((uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), (uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), acc.s2); \
acc.s2 += val8.s2 * w8.s2; \
\
- ARM_DOT(val0.s3, val1.s3, val2.s3, val3.s3, w0.s3, w1.s3, w2.s3, w3.s3, acc.s3); \
- ARM_DOT(val4.s3, val5.s3, val6.s3, val7.s3, w4.s3, w5.s3, w6.s3, w7.s3, acc.s3); \
+ ARM_DOT((uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), (uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), acc.s3); \
+ ARM_DOT((uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), (uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), acc.s3); \
acc.s3 += val8.s3 * w8.s3; \
})
#if WEIGHTS_OFFSET != 0
-#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
- ({ \
- sum += CONVERT(val0, VEC_INT) + CONVERT(val1, VEC_INT) + CONVERT(val2, VEC_INT) + CONVERT(val3, VEC_INT) + CONVERT(val4, VEC_INT) + CONVERT(val5, VEC_INT) + CONVERT(val6, VEC_INT) + CONVERT(val7, VEC_INT) + CONVERT(val8, VEC_INT); \
- DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8); \
+#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
+ ({ \
+ ARM_DOT((uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), (uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), acc.s0); \
+ ARM_DOT((uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), (uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), acc.s0); \
+ ARM_DOT((uchar4)(w8.s0, 0, 0, 0), (uchar4)val8.s0, acc.s0); \
+ \
+ ARM_DOT((uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), (uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), acc.s1); \
+ ARM_DOT((uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), (uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), acc.s1); \
+ ARM_DOT((uchar4)(w8.s1, 0, 0, 0), (uchar4)val8.s1, acc.s1); \
+ \
+ ARM_DOT((uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), (uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), acc.s2); \
+ ARM_DOT((uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), (uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), acc.s2); \
+ ARM_DOT((uchar4)(w8.s2, 0, 0, 0), (uchar4)val8.s2, acc.s2); \
+ \
+ ARM_DOT((uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), (uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), acc.s3); \
+ ARM_DOT((uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), (uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), acc.s3); \
+ ARM_DOT((uchar4)(w8.s3, 0, 0, 0), (uchar4)val8.s3, acc.s3); \
})
#else /* WEIGHTS_OFFSET != 0 */
-#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)
+#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)
#endif /* WEIGHTS_OFFSET != 0 */
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
+ ({ \
+ sum = CONVERT(val0, VEC_INT); \
+ ARM_DOT((uchar4)(val1.s0, val2.s0, val3.s0, val4.s0), (uchar4)1, sum.s0); \
+ ARM_DOT((uchar4)(val5.s0, val6.s0, val7.s0, val8.s0), (uchar4)1, sum.s0); \
+ \
+ ARM_DOT((uchar4)(val1.s1, val2.s1, val3.s1, val4.s1), (uchar4)1, sum.s1); \
+ ARM_DOT((uchar4)(val5.s1, val6.s1, val7.s1, val8.s1), (uchar4)1, sum.s1); \
+ \
+ ARM_DOT((uchar4)(val1.s2, val2.s2, val3.s2, val4.s2), (uchar4)1, sum.s2); \
+ ARM_DOT((uchar4)(val5.s2, val6.s2, val7.s2, val8.s2), (uchar4)1, sum.s2); \
+ \
+ ARM_DOT((uchar4)(val1.s3, val2.s3, val3.s3, val4.s3), (uchar4)1, sum.s3); \
+ ARM_DOT((uchar4)(val5.s3, val6.s3, val7.s3, val8.s3), (uchar4)1, sum.s3); \
+ })
+
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
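
For clarity, the DOT_PRODUCT, DOT_PRODUCT_ACCUMULATE and DOT_PRODUCT_REDUCTION macros above split the nine multiply-accumulates of a 3x3 window into 4-wide dot products plus a scalar tail, which is the shape the cl_arm_integer_dot_product_int8 built-in accelerates. A scalar C sketch of the same arithmetic, with a plain loop standing in for ARM_DOT (illustrative only):

#include <stdint.h>

/* dot4() stands in for the ARM_DOT built-in: accumulate a 4-element
 * unsigned 8-bit dot product into a 32-bit accumulator. */
static int32_t dot4(const uint8_t a[4], const uint8_t b[4], int32_t acc)
{
    for(int i = 0; i < 4; ++i)
    {
        acc += (int32_t)a[i] * (int32_t)b[i];
    }
    return acc;
}

/* One output point of a 3x3 convolution expressed as 4 + 4 + 1 terms,
 * mirroring the per-lane structure of the DOT_PRODUCT macro above. */
static int32_t conv3x3_point(const uint8_t v[9], const uint8_t w[9], int32_t acc)
{
    acc = dot4(&v[0], &w[0], acc);        /* terms 0..3 */
    acc = dot4(&v[4], &w[4], acc);        /* terms 4..7 */
    acc += (int32_t)v[8] * (int32_t)w[8]; /* trailing ninth term */
    return acc;
}
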
#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
@@ -565,14 +644,16 @@
 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -580,6 +661,8 @@
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -596,8 +679,8 @@
* @param[in] max_offset Max offset for the input tensor
*/
__kernel void depthwise_convolution_3x3_quantized_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
@@ -606,17 +689,34 @@
{
const int x = get_global_id(0); // channels
const int y = get_global_id(1); // spatial coordinate x
- const int z = get_global_id(2); // spatial coordinate y
+#if defined(DST_DEPTH)
+ int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+ int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
+ int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(DST_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
- int z_coord = 0;
- int4 offset = 0;
- const int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
+ int z_coord = 0;
+ int4 offset = 0;
+ int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
- // We compute 2x1x1 [C,W,H] elements
+    // Only for y = 0 can the coordinate be negative. If so, we clamp it to SRC_DIM_1
+ y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
+ y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
+ y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
+ y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
+
+ int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
+
+ // We compute 4x1x1 [C,W,H] elements
VEC_INT acc = 0, sum = 0;
// Load weights
@@ -698,15 +798,28 @@
acc += (VEC_INT)K_OFFSET;
#endif /* K_OFFSET != 0 */
+#if defined(REAL_MULTIPLIER)
+
+ acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+
+#else // defined(REAL_MULTIPLIER)
+
acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // defined(REAL_MULTIPLIER)
+
acc += (VEC_INT)OUTPUT_OFFSET;
VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR);
res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+#if defined(DST_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else /* defined(DST_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
VSTORE(VEC_SIZE)
- (res, 0, dst.ptr);
+ (ACTIVATION_FUNC(res), 0, dst_addr);
}
#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
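
The y_coord handling in the NHWC kernel above (and in the stride-1 variants that follow) relies on an unsigned-min trick: a negative coordinate cast to uint becomes a very large value, so a single min() against SRC_DIM_1 leaves valid coordinates untouched and maps the left-padding case to the border element without a branch. A small C sketch of that clamp (illustrative only):

#include <stdint.h>

/* Sketch of the unsigned-min clamp used for y_coord: -1 cast to uint32_t
 * is 4294967295, so min((uint)coord, src_dim_1) maps any negative
 * coordinate to src_dim_1, the border element, branch-free. */
static uint32_t clamp_coord(int32_t coord, uint32_t src_dim_1)
{
    uint32_t c = (uint32_t)coord;
    return (c < src_dim_1) ? c : src_dim_1;
}
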
@@ -720,14 +833,16 @@
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
 * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -735,6 +850,8 @@
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -752,8 +869,8 @@
*/
__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
@@ -762,17 +879,34 @@
{
int x = get_global_id(0);
int y = get_global_id(1);
- int z = get_global_id(2);
+#if defined(DST_DEPTH)
+ int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+ int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
+ int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(DST_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
- int z_coord = 0;
- int4 offset = 0;
- int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
+ int z_coord = 0;
+ int4 offset = 0;
+ int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
- // We compute 2x2x2 [C,W,H] elements
+    // Only for y = 0 can the coordinate be negative. If so, we clamp it to SRC_DIM_1
+ y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
+ y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
+ y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
+ y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
+
+ int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
+
+ // We compute 4x2x2 [C,W,H] elements
VEC_INT acc0 = 0, sum0 = 0;
VEC_INT acc1 = 0, sum1 = 0;
VEC_INT acc2 = 0, sum2 = 0;
@@ -916,11 +1050,22 @@
acc3 += (VEC_INT)K_OFFSET;
#endif /* K_OFFSET != 0 */
+#if defined(REAL_MULTIPLIER)
+
+ acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+ acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+ acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+ acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+
+#else // defined(REAL_MULTIPLIER)
+
acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // defined(REAL_MULTIPLIER)
+
acc0 += (VEC_INT)OUTPUT_OFFSET;
acc1 += (VEC_INT)OUTPUT_OFFSET;
acc2 += (VEC_INT)OUTPUT_OFFSET;
@@ -936,25 +1081,29 @@
res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
+#if defined(DST_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
+#endif /* defined(DST_DEPTH) */
VSTORE(VEC_SIZE)
- (res0, 0, dst_addr + 0 * dst_stride_y);
+ (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
VSTORE(VEC_SIZE)
- (res1, 0, dst_addr + 1 * dst_stride_y);
+ (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
{
VSTORE(VEC_SIZE)
- (res2, 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
+ (ACTIVATION_FUNC(res2), 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
VSTORE(VEC_SIZE)
- (res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
+ (ACTIVATION_FUNC(res3), 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
}
}
-#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product
*
* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
@@ -963,15 +1112,19 @@
* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
 * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
+ * @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.
+ * If not, the quantization will be performed using a fixed point multiplication
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -979,6 +1132,8 @@
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -992,34 +1147,52 @@
* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ * @param[in] max_offset The maximum allowed offset for the input tensor
*/
__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
+#endif // defined(HAS_BIAS)
int max_offset)
{
int x = get_global_id(0);
int y = get_global_id(1);
- int z = get_global_id(2);
+#if defined(DST_DEPTH)
+ int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+ int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
+ int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(DST_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else /* defined(DST_DEPTH) */
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
- int z_coord = 0;
- int4 offset = 0;
- int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
+ int z_coord = 0;
+ int4 offset = 0;
+ int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
- // We compute 2x2x2 [C,W,H] elements
- VEC_INT acc0 = 0, sum0 = 0;
- VEC_INT acc1 = 0, sum1 = 0;
- VEC_INT acc2 = 0, sum2 = 0;
- VEC_INT acc3 = 0, sum3 = 0;
+    // Only for y = 0 can the coordinate be negative. If so, we clamp it to SRC_DIM_1
+ y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
+ y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
+ y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
+ y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
+
+ int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
+
+ // We compute 4x2x1 [C,W,H] elements
+ VEC_INT acc0 = 0;
+ VEC_INT acc1 = 0;
+ VEC_INT sum0 = 0;
+ VEC_INT sum1 = 0;
// Load weights
VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
@@ -1033,17 +1206,21 @@
VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
#if INPUT_OFFSET != 0
- VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
- + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
- + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
-#endif /* INPUT_OFFSET != 0 */
+    // Initialize the final result with the weights reduction multiplied by INPUT_OFFSET
+ DOT_PRODUCT_REDUCTION(acc0, w0, w1, w2, w3, w4, w5, w6, w7, w8);
+
+    // Multiply the weights reduction by INPUT_OFFSET
+ acc0 = INPUT_OFFSET * acc0;
+
+ acc1 = acc0;
+#endif // INPUT_OFFSET != 0
// Load input values
// z == 0
    // Clamp z_coord: for z = 0 it can be negative
    // z_coord is cast to unsigned int in order to use just a min() operation
// A "-1" 32 bit signed variable converted to unsigned gives 4294967295
- z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
+ z_coord = z - (int)CONV_PAD_TOP;
z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
offset = y_offset + (int4)(z_coord * src_stride_z);
offset = min(offset, (int4)max_offset);
@@ -1056,7 +1233,7 @@
// z == 1
    // z_coord can only be negative for z = 0 so we do not need to clamp it
// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
- z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
+ z_coord = z - (int)CONV_PAD_TOP + 1;
offset = y_offset + (int4)(z_coord * src_stride_z);
VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
@@ -1073,20 +1250,11 @@
VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
- // z == 3
- // After z = 1 we can simply add src_stride_z to offset without updating z_coord
- // However offset can be out-of-bound so we need to check if it is greater than max_offset
- offset += (int4)(src_stride_z);
- offset = min(offset, (int4)max_offset);
- VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
- VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
- VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
- VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
+ DOT_PRODUCT_REDUCTION(sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10);
+ DOT_PRODUCT_ACCUMULATE(acc0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);
- DOT_PRODUCT_ACCUMULATE(acc0, sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);
- DOT_PRODUCT_ACCUMULATE(acc1, sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);
- DOT_PRODUCT_ACCUMULATE(acc2, sum2, values4, values5, values6, values8, values9, values10, values12, values13, values14, w0, w1, w2, w3, w4, w5, w6, w7, w8);
- DOT_PRODUCT_ACCUMULATE(acc3, sum3, values5, values6, values7, values9, values10, values11, values13, values14, values15, w0, w1, w2, w3, w4, w5, w6, w7, w8);
+ DOT_PRODUCT_REDUCTION(sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11);
+ DOT_PRODUCT_ACCUMULATE(acc1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);
#if defined(HAS_BIAS)
Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
@@ -1095,74 +1263,56 @@
acc0 += bias_values;
acc1 += bias_values;
- acc2 += bias_values;
- acc3 += bias_values;
-#endif /* defined(HAS_BIAS) */
+
+#endif // defined(HAS_BIAS)
#if WEIGHTS_OFFSET != 0
acc0 += WEIGHTS_OFFSET * sum0;
acc1 += WEIGHTS_OFFSET * sum1;
- acc2 += WEIGHTS_OFFSET * sum2;
- acc3 += WEIGHTS_OFFSET * sum3;
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if INPUT_OFFSET != 0
- VEC_INT offs = INPUT_OFFSET * sum_we;
-
- acc0 += offs;
- acc1 += offs;
- acc2 += offs;
- acc3 += offs;
-#endif /* INPUT_OFFSET != 0 */
+#endif // WEIGHTS_OFFSET != 0
#if K_OFFSET != 0
acc0 += (VEC_INT)K_OFFSET;
acc1 += (VEC_INT)K_OFFSET;
- acc2 += (VEC_INT)K_OFFSET;
- acc3 += (VEC_INT)K_OFFSET;
-#endif /* K_OFFSET != 0 */
+
+#endif // K_OFFSET != 0
+
+#if defined(REAL_MULTIPLIER)
+
+ acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+ acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+
+#else // defined(REAL_MULTIPLIER)
acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
- acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // defined(REAL_MULTIPLIER)
acc0 += (VEC_INT)OUTPUT_OFFSET;
acc1 += (VEC_INT)OUTPUT_OFFSET;
- acc2 += (VEC_INT)OUTPUT_OFFSET;
- acc3 += (VEC_INT)OUTPUT_OFFSET;
VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
- VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
- VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);
res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
- res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
- res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
+#if defined(DST_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else /* defined(DST_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
VSTORE(VEC_SIZE)
- (res0, 0, dst_addr + 0 * dst_stride_y);
+ (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
VSTORE(VEC_SIZE)
- (res1, 0, dst_addr + 1 * dst_stride_y);
-
-#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
- if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
-#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
- {
- VSTORE(VEC_SIZE)
- (res2, 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
- VSTORE(VEC_SIZE)
- (res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
- }
+ (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
}
-#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
+
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
-#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
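
The restructured dot8 stride-1 kernel above follows from expanding the quantized product over the 3x3 window: sum((x + INPUT_OFFSET) * (w + WEIGHTS_OFFSET)) = sum(x * w) + WEIGHTS_OFFSET * sum(x) + INPUT_OFFSET * sum(w) + 9 * INPUT_OFFSET * WEIGHTS_OFFSET. The accumulators are therefore seeded with INPUT_OFFSET times the weights reduction, the per-pixel input reductions (sum0/sum1) are added scaled by WEIGHTS_OFFSET, and the remaining constant term (9 * INPUT_OFFSET * WEIGHTS_OFFSET by this algebra) is what K_OFFSET accounts for. A scalar C sketch of the decomposition (illustrative, not the kernel's interface):

#include <stdint.h>

/* Scalar sketch of the offset decomposition used by the quantized 3x3
 * depthwise kernels: the cross terms are accumulated separately so the
 * inner loop only needs raw uchar dot products. k_offset is assumed to
 * equal 9 * input_offset * weights_offset. */
static int32_t quantized_conv3x3(const uint8_t x[9], const uint8_t w[9],
                                 int32_t input_offset, int32_t weights_offset,
                                 int32_t k_offset)
{
    int32_t acc = 0, sum_x = 0, sum_w = 0;
    for(int i = 0; i < 9; ++i)
    {
        acc   += (int32_t)x[i] * (int32_t)w[i];
        sum_x += x[i];
        sum_w += w[i];
    }
    /* Equal to sum over i of (x[i] + input_offset) * (w[i] + weights_offset) */
    return acc + weights_offset * sum_x + input_offset * sum_w + k_offset;
}
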
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
index 70be058..5299409 100644
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl
@@ -194,11 +194,11 @@
__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;
weights_addr += id0 * weights_stride_w;
- const int coordy = id2 - PAD_TOP;
+#if(PAD_TOP == 1)
+ const int coordy = id2 - PAD_TOP;
for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
{
-#if(PAD_TOP)
        if(coordy < 0) // special case Z = -1 doesn't exist
        {
            // Skip the first row and load the remaining ones
@@ -224,17 +224,69 @@
CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
}
-#else //PAD_TOP > 0
+ src_addr += src_stride_x;
+ weights_addr += weights_stride_x;
+ }
+#elif(PAD_TOP == 2)
+ const int coordy = id2 * STRIDE_Y;
+ for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
+        if(coordy == 0) // special case Z = -2 doesn't exist
+ {
+            // Skip the first two rows and load the three remaining ones
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+ }
+        else if(coordy == 1) // special case Z = -1 doesn't exist
+ {
+            // Skip the first row and load the four remaining ones
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+ }
+ else if(coordy == (SRC_HEIGHT - 1))
+ {
+            // Special case: the last two rows of the 5-row window fall below the input, so only the last three input
+            // rows can be read (the Z axis has no padding at the bottom).
+ CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+ }
+ else if(coordy == (SRC_HEIGHT - 2))
+ {
+            // Special case: the last row of the 5-row window falls below the input, so only the last four input rows
+            // can be read (the Z axis has no padding at the bottom).
+ CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+ }
+ else
+ {
+ CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+ CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+ }
+ src_addr += src_stride_x;
+ weights_addr += weights_stride_x;
+ }
+
+#else /* PAD_TOP == 2 */
+ for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-#endif // PAD_TOP > 0
-
src_addr += src_stride_x;
weights_addr += weights_stride_x;
}
+#endif /* PAD_TOP == 1 */
#ifdef HAS_BIAS
Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
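
The PAD_TOP == 2 branches added above select, for each value of coordy, which of the five window rows actually fall inside the input: rows 2..4 for coordy == 0, rows 1..4 for coordy == 1, rows 0..3 and 0..2 near the bottom, and all five otherwise. A small C sketch of that row-window computation, assuming a 5-row window, PAD_TOP == 2 and no bottom padding (illustrative only):

/* Sketch: first and last valid rows of a 5-row window centred on input
 * row coordy, for PAD_TOP == 2 and no bottom padding. */
static void valid_window_rows(int coordy, int src_height, int *first, int *last)
{
    int start = coordy - 2;            /* top row of the window    */
    int end   = coordy + 2;            /* bottom row of the window */
    *first = (start < 0) ? -start : 0; /* skip rows above the input */
    *last  = (end > src_height - 1) ? 4 - (end - (src_height - 1)) : 4;
}
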
diff --git a/src/core/CL/cl_kernels/flatten.cl b/src/core/CL/cl_kernels/flatten.cl
index df0f9c4..02694f7 100644
--- a/src/core/CL/cl_kernels/flatten.cl
+++ b/src/core/CL/cl_kernels/flatten.cl
@@ -23,12 +23,13 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH)
/** This opencl kernel flattens the first 3 dimensions of the input tensor
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT. e.g. -DSRC_WIDTH=24, -DSRC_HEIGHT=24
+ * @note The width, height and depth of the input tensor must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT and -DSRC_DEPTH. e.g. -DSRC_WIDTH=24, -DSRC_HEIGHT=24, -DSRC_DEPTH=16
+ * @note If the output has 3 dimensions, the 2nd dimension of the output tensor must be passed at compile time using -DDST_DIM1. e.g. -DDST_DIM1=3
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -37,20 +38,38 @@
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                            src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void flatten(
- TENSOR3D_DECLARATION(src),
- VECTOR_DECLARATION(dst))
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * (int)SRC_WIDTH + get_global_id(2) * (int)(SRC_WIDTH * SRC_HEIGHT)) * sizeof(
- DATA_TYPE);
+ uint c = get_global_id(2) % SRC_DEPTH; // input feature map
+ uint b0 = get_global_id(2) / SRC_DEPTH; // batch id
+ uint b1 = 0;
+
+#if defined(DST_DIM1)
+ uint b_tmp = b0;
+ b0 = b_tmp % DST_DIM1; // batch id0
+ b1 = b_tmp / DST_DIM1; // batch id1
+#endif // defined(DST_DIM1)
+
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (get_global_id(0) + get_global_id(1) * (uint)SRC_WIDTH + c * (uint)(SRC_WIDTH * SRC_HEIGHT)) * sizeof(DATA_TYPE) +
+ b0 * dst_stride_y +
+ b1 * dst_stride_z;
*((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)src.ptr);
}
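
The reworked flatten kernel above collapses the first three input dimensions into the output's first dimension and, when DST_DIM1 is defined, splits the batch index across the second and third output dimensions. A host-side C sketch of the resulting byte offset (illustrative; the parameter names are assumptions):

#include <stddef.h>

/* Sketch of the flatten addressing: element (x, y, c) of batch `batch`
 * lands at flat position x + y*W + c*W*H along the output's first
 * dimension, while the batch index is optionally split into two output
 * dimensions when dst_dim1 is non-zero. */
static size_t flatten_offset(size_t x, size_t y, size_t c, size_t batch,
                             size_t src_w, size_t src_h, size_t dst_dim1,
                             size_t dst_stride_y, size_t dst_stride_z, size_t elem_size)
{
    size_t flat = x + y * src_w + c * src_w * src_h;
    size_t b0   = dst_dim1 ? batch % dst_dim1 : batch;
    size_t b1   = dst_dim1 ? batch / dst_dim1 : 0;
    return flat * elem_size + b0 * dst_stride_y + b1 * dst_stride_z;
}
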
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 932e0d6..7de15d0 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -84,7 +84,8 @@
#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
+/** This OpenCL kernel reshapes the input matrix by transposing each 4x4 block. If -DUNROLL_BLOCK is passed at compile time, the 4x4 block
+ * will simply be unrolled (i.e. stored back without transposing it).
*
* @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -187,6 +188,12 @@
a3 = vload4(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
#endif // defined(REINTERPRET_INPUT_AS_3D)
+#if defined(UNROLL_BLOCK)
+ vstore4(a0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
+ vstore4(a1, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+ vstore4(a2, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
+ vstore4(a3, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
+#else // defined(UNROLL_BLOCK)
VEC_DATA_TYPE(DATA_TYPE, 4)
val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
@@ -199,6 +206,7 @@
val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3);
vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
+#endif // defined(UNROLL_BLOCK)
}
#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
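
The -DUNROLL_BLOCK option added above only changes the write-back order of each 4x4 block: by default the block is written column by column (i.e. transposed), with the option the four loaded rows are stored back unchanged. A small C sketch of the two layouts, assuming MULT_INTERLEAVE4X4_HEIGHT == 1 (illustrative only):

/* Sketch of the two 4x4 reshape layouts for MULT_INTERLEAVE4X4_HEIGHT == 1:
 * `in` holds four rows of four values, `out` receives 16 contiguous values. */
static void reshape_4x4(const float in[4][4], float out[16], int unroll_block)
{
    int k = 0;
    if(unroll_block)
    {
        for(int r = 0; r < 4; ++r)      /* -DUNROLL_BLOCK: rows unchanged */
            for(int c = 0; c < 4; ++c)
                out[k++] = in[r][c];
    }
    else
    {
        for(int c = 0; c < 4; ++c)      /* default: transpose the block   */
            for(int r = 0; r < 4; ++r)
                out[k++] = in[r][c];
    }
}
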
@@ -871,6 +879,183 @@
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
}
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) while accumulating the result in a 32-bit floating point variable.
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float8 c00 = 0.0f;
+ float8 c10 = 0.0f;
+ float8 c20 = 0.0f;
+ float8 c30 = 0.0f;
+
+ for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = convert_float4(vload4(0, src_addr_a));
+ float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+ c00 += (float8)a0.s0 * b0;
+ c10 += (float8)a0.s1 * b0;
+ c20 += (float8)a0.s2 * b0;
+ c30 += (float8)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+ b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));
+
+ c00 += (float8)a0.s0 * b0;
+ c10 += (float8)a0.s1 * b0;
+ c20 += (float8)a0.s2 * b0;
+ c30 += (float8)a0.s3 * b0;
+ }
+
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = convert_float4(vload4(0, src_addr_a));
+ float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+ c00 += (float8)a0.s0 * b0;
+ c10 += (float8)a0.s1 * b0;
+ c20 += (float8)a0.s2 * b0;
+ c30 += (float8)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+#if defined(ALPHA)
+ // Multiply by the weight of matrix product
+ c00 = c00 * (float8)ALPHA;
+ c10 = c10 * (float8)ALPHA;
+ c20 = c20 * (float8)ALPHA;
+ c30 = c30 * (float8)ALPHA;
+#endif // defined(ALPHA)
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+ // Store 4x8 block
+ vstore8(convert_half8(c00), 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(convert_half8(c10), 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(convert_half8(c20), 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(convert_half8(c30), 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+ // Store 4x8 block
+ vstore8(convert_half8(c00), 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+ vstore8(convert_half8(c10), 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+ vstore8(convert_half8(c20), 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+ vstore8(convert_half8(c30), 0, (__global half *)(dst_addr + 3 * dst_stride_y));
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+}
+
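
The REINTERPRET_OUTPUT_AS_3D block above maps each row of the 2D output tile to the plane it belongs to in the 3D tensor and turns any cross-plane padding into an extra byte offset added to the store address. A scalar C sketch of that extra offset (illustrative; the parameter names are assumptions):

#include <stdint.h>

/* Sketch of the cross-plane padding offset: output row `row` of the
 * flattened (HEIGHT_GEMM3D * DEPTH_GEMM3D) matrix belongs to plane
 * row / height_gemm3d; each plane crossed adds cross_plane_pad rows of
 * padding to the byte offset, mirroring zout above. */
static uint32_t cross_plane_offset(uint32_t row, uint32_t height_gemm3d,
                                   uint32_t depth_gemm3d, uint32_t cross_plane_pad,
                                   uint32_t dst_stride_y)
{
    uint32_t plane = row / height_gemm3d;
    if(plane > depth_gemm3d - 1)
    {
        plane = depth_gemm3d - 1; /* same clamp as min(DEPTH_GEMM3D - 1, zout) */
    }
    return plane * cross_plane_pad * dst_stride_y;
}
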
/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
@@ -2291,6 +2476,354 @@
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ float8 acc0 = 0.0h;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float8 acc1 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float8 acc2 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float8 acc3 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ int i = 0;
+ for(; i <= ((int)COLS_A - 4); i += 4)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+
+ // Accumulate
+ acc0 = fma(b0, (float8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(half);
+ }
+
+ for(; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+
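+ // Advance matrix A by one column (sizeof(half) bytes) and matrix B by one row for the next accumulation step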
+ src_addr += (int2)(sizeof(half), src1_stride_y);
+
+ // Accumulate
+ acc0 = fma(b0, (float8)a0, acc0); // b0 * (float8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1, acc1); // b0 * (float8)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2, acc2); // b0 * (float8)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3, acc3); // b0 * (float8)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ half8 hacc0 = convert_half8(acc0) * (half8)ALPHA;
+#else //defined(ALPHA)
+ half8 hacc0 = convert_half8(acc0);
+#endif // defined(ALPHA)
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(ALPHA)
+ half8 hacc1 = convert_half8(acc1) * (half8)ALPHA;
+#else //defined(ALPHA)
+ half8 hacc1 = convert_half8(acc1);
+#endif //defined(ALPHA)
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(ALPHA)
+ half8 hacc2 = convert_half8(acc2) * (half8)ALPHA;
+#else //defined(ALPHA)
+ half8 hacc2 = convert_half8(acc2);
+#endif //defined(ALPHA)
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(ALPHA)
+ half8 hacc3 = convert_half8(acc3) * (half8)ALPHA;
+#else //defined(ALPHA)
+ half8 hacc3 = convert_half8(acc3);
+#endif // defined(ALPHA)
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+ // Store the output block
+ vstore8(hacc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vstore8(hacc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vstore8(hacc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vstore8(hacc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+ // Store the output block
+ vstore8(hacc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vstore8(hacc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vstore8(hacc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vstore8(hacc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // REINTERPRET_OUTPUT_AS_3D
+}
+
+/** This OpenCL kernel computes the matrix-by-matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index cd8b269..8c1fa54 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -24,13 +24,13 @@
#include "helpers.h"
#include "helpers_asymm.h"
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val);
-#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3));
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
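+// Illustrative example: with uchar4 a = (uchar4)(1, 2, 3, 4), uchar4 b = (uchar4)(5, 6, 7, 8) and uint c = 10,
+// ARM_DOT(a, b, c) leaves c = 10 + 1*5 + 2*6 + 3*7 + 4*8 = 80 on both the accumulate and non-accumulate paths.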
#if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
@@ -40,6 +40,12 @@
* @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
*
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
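+ * For example (hypothetical values), building with -DREINTERPRET_OUTPUT_AS_3D -DHEIGHT_GEMM3D=7 -DDEPTH_GEMM3D=3
+ * makes the kernel scatter rows 0-6 of the 2D output tile into plane 0, rows 7-13 into plane 1 and any remaining rows into plane 2.
+ *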
* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -58,13 +64,26 @@
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom padding in units of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
{
- int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
- int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
+ const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ const int z = get_global_id(2);
// Offset
const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -72,9 +91,16 @@
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
__global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr_b += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
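+ // For example, when MATRIX_B_DEPTH is defined as 3 (hypothetical) and matrix A is batched over 6 slices, the
+ // work-items at z = 0,1,2,3,4,5 read matrix B slices 0,1,2,0,1,2 respectively.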
+
// Compute end row address for matrix B
__global uchar *src_end_addr_b = src_addr_b + COLS_B;
@@ -122,11 +148,49 @@
// Compute destination address
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
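+ // For example (hypothetical values): with cross_plane_pad = 2 and dst_stride_y = 64 bytes, a row stored on plane p gets
+ // an extra p * 128 bytes added to its address below, skipping the two padded rows between consecutive planes.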
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
// Store 4x4 block
- vstore4(c00, 0, (__global int *)(offset(&dst, 0, 0)));
- vstore4(c10, 0, (__global int *)(offset(&dst, 0, 1)));
- vstore4(c20, 0, (__global int *)(offset(&dst, 0, 2)));
- vstore4(c30, 0, (__global int *)(offset(&dst, 0, 3)));
+ vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
+ vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
+ vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
+ vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst.ptr += z * dst_stride_z;
+
+ // Store 4x4 block
+ vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+ vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+ vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
}
/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
@@ -136,6 +200,12 @@
* @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
*
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -154,13 +224,26 @@
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom padding in units of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
{
- int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
- int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
+ const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ const int z = get_global_id(2);
// Offset
const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -168,9 +251,16 @@
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
__global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr_b += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
// Compute end row address for matrix B
__global uchar *src_end_addr_b = src_addr_b + COLS_B;
@@ -416,14 +506,52 @@
// Compute destination address
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
// Store 4x4 block
- vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
- vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
- vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
- vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
+ vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
+ vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
+ vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
+ vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst.ptr += z * dst_stride_z;
+
+ // Store 4x4 block
+ vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+ vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+ vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+ vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
}
-#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
*
@@ -431,6 +559,12 @@
* @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
*
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -449,25 +583,38 @@
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom padding in units of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemmlowp_mm_interleaved_transposed_bifrost_dot8(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
{
- int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
- int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
-
// Offset
const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
- __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+ __global uchar *src_addr_a = (__global uchar *)(src0_ptr + (get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT) * src0_stride_y + get_global_id(2) * src0_stride_z + src0_offset_first_element_in_bytes);
+ __global uchar *src_addr_b = (__global uchar *)(src1_ptr + (get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP) * src1_stride_y + src1_offset_first_element_in_bytes);
- // Compute end row address for matrix B
- __global uchar *src_end_addr_b = src_addr_b + COLS_B;
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr_b += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr_b += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
src_addr_a += offset_row_a;
src_addr_b += offset_row_b;
@@ -477,21 +624,27 @@
uint c01 = 0;
uint c02 = 0;
uint c03 = 0;
+
uint c10 = 0;
uint c11 = 0;
uint c12 = 0;
uint c13 = 0;
+
uint c20 = 0;
uint c21 = 0;
uint c22 = 0;
uint c23 = 0;
+
uint c30 = 0;
uint c31 = 0;
uint c32 = 0;
uint c33 = 0;
+#define COLS_MTX_B (COLS_B / (16 * MULT_TRANSPOSE1XW_WIDTH))
+
#if MULT_INTERLEAVE4X4_HEIGHT == 1
- for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
+ int i = 0;
+ for(; i <= (int)(COLS_MTX_B - 8); i += 8)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
uchar16 a0 = vload16(0, src_addr_a);
@@ -499,95 +652,138 @@
uchar4 b1 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);
uchar4 b2 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);
uchar4 b3 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);
+ uchar4 b4 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
+ uchar4 b5 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
+ uchar4 b6 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
+ uchar4 b7 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);
// Accumulate
- ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s0, b1.s0, b2.s0, b3.s0, c00);
- ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s1, b1.s1, b2.s1, b3.s1, c01);
- ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s2, b1.s2, b2.s2, b3.s2, c02);
- ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s3, b1.s3, b2.s3, b3.s3, c03);
+ ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c00);
+ ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c01);
+ ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c02);
+ ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c03);
- ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s0, b1.s0, b2.s0, b3.s0, c10);
- ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s1, b1.s1, b2.s1, b3.s1, c11);
- ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s2, b1.s2, b2.s2, b3.s2, c12);
- ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s3, b1.s3, b2.s3, b3.s3, c13);
+ ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c10);
+ ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c11);
+ ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c12);
+ ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c13);
- ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s0, b1.s0, b2.s0, b3.s0, c20);
- ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s1, b1.s1, b2.s1, b3.s1, c21);
- ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s2, b1.s2, b2.s2, b3.s2, c22);
- ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s3, b1.s3, b2.s3, b3.s3, c23);
+ ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c20);
+ ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c21);
+ ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c22);
+ ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c23);
- ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s0, b1.s0, b2.s0, b3.s0, c30);
- ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s1, b1.s1, b2.s1, b3.s1, c31);
- ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s2, b1.s2, b2.s2, b3.s2, c32);
- ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s3, b1.s3, b2.s3, b3.s3, c33);
+ ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c30);
+ ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c31);
+ ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c32);
+ ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c33);
- // Load values from matrix A (interleaved) and matrix B (transposed)
+ // Accumulate
a0 = vload16(0, src_addr_a + 16);
- b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
- b1 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
- b2 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
- b3 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);
- // Accumulate
- ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s0, b1.s0, b2.s0, b3.s0, c00);
- ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s1, b1.s1, b2.s1, b3.s1, c01);
- ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s2, b1.s2, b2.s2, b3.s2, c02);
- ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s3, b1.s3, b2.s3, b3.s3, c03);
+ ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c00);
+ ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c01);
+ ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c02);
+ ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c03);
- ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s0, b1.s0, b2.s0, b3.s0, c10);
- ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s1, b1.s1, b2.s1, b3.s1, c11);
- ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s2, b1.s2, b2.s2, b3.s2, c12);
- ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s3, b1.s3, b2.s3, b3.s3, c13);
+ ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c10);
+ ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c11);
+ ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c12);
+ ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c13);
- ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s0, b1.s0, b2.s0, b3.s0, c20);
- ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s1, b1.s1, b2.s1, b3.s1, c21);
- ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s2, b1.s2, b2.s2, b3.s2, c22);
- ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s3, b1.s3, b2.s3, b3.s3, c23);
+ ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c20);
+ ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c21);
+ ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c22);
+ ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c23);
- ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s0, b1.s0, b2.s0, b3.s0, c30);
- ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s1, b1.s1, b2.s1, b3.s1, c31);
- ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s2, b1.s2, b2.s2, b3.s2, c32);
- ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s3, b1.s3, b2.s3, b3.s3, c33);
+ ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c30);
+ ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c31);
+ ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c32);
+ ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c33);
+
+ src_addr_a += 32;
+ src_addr_b += 32 * TRANSPOSE1XW_WIDTH_STEP;
}
#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
-
- for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
+ int i_left_over = 0;
+ for(; i < (int)(COLS_MTX_B); ++i)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- uchar4 a0 = vload4(0, src_addr_a);
- uchar4 b0 = vload4(0, src_addr_b);
+ uchar16 a0 = vload16(0, src_addr_a + (i_left_over % 4) + ((i_left_over / 4) * 16));
+ uchar4 b0 = vload4(0, src_addr_b);
- c00 += (ushort)a0.s0 * b0.s0;
- c01 += (ushort)a0.s0 * b0.s1;
- c02 += (ushort)a0.s0 * b0.s2;
- c03 += (ushort)a0.s0 * b0.s3;
+ c00 += a0.s0 * b0.s0;
+ c01 += a0.s0 * b0.s1;
+ c02 += a0.s0 * b0.s2;
+ c03 += a0.s0 * b0.s3;
- c10 += (ushort)a0.s1 * b0.s0;
- c11 += (ushort)a0.s1 * b0.s1;
- c12 += (ushort)a0.s1 * b0.s2;
- c13 += (ushort)a0.s1 * b0.s3;
+ c10 += a0.s4 * b0.s0;
+ c11 += a0.s4 * b0.s1;
+ c12 += a0.s4 * b0.s2;
+ c13 += a0.s4 * b0.s3;
- c20 += (ushort)a0.s2 * b0.s0;
- c21 += (ushort)a0.s2 * b0.s1;
- c22 += (ushort)a0.s2 * b0.s2;
- c23 += (ushort)a0.s2 * b0.s3;
+ c20 += a0.s8 * b0.s0;
+ c21 += a0.s8 * b0.s1;
+ c22 += a0.s8 * b0.s2;
+ c23 += a0.s8 * b0.s3;
- c30 += (ushort)a0.s3 * b0.s0;
- c31 += (ushort)a0.s3 * b0.s1;
- c32 += (ushort)a0.s3 * b0.s2;
- c33 += (ushort)a0.s3 * b0.s3;
+ c30 += a0.sC * b0.s0;
+ c31 += a0.sC * b0.s1;
+ c32 += a0.sC * b0.s2;
+ c33 += a0.sC * b0.s3;
+
+ i_left_over++;
+ src_addr_b += 4 * TRANSPOSE1XW_WIDTH_STEP;
}
// Compute destination address
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst.ptr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
// Store 4x4 block
- vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
- vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
- vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
- vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
+ vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
+ vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
+ vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
+ vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst.ptr += get_global_id(2) * dst_stride_z;
+
+ // Store 4x4 block
+ vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+ vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+ vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+ vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
}
-#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
@@ -599,6 +795,13 @@
*
* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
*
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -617,10 +820,27 @@
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom padding in units of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom padding in units of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
{
int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
@@ -633,6 +853,47 @@
// Update address for the matrix B
src_addr.s1 += idx;
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
int end_row_vec_a = src_addr.s0 + COLS_A;
VECTOR_UINT acc0 = 0;
@@ -725,34 +986,95 @@
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
+ const int z = get_global_id(2);
+
// Compute destination address
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
// Store the result
VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
+ (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
+ (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
+ (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
+ (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 4)));
+ (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst.ptr += z * dst_stride_z;
+
+ // Store the result
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
}
 /** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
*
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -771,10 +1093,27 @@
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom padding in units of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom padding in units of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
{
int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
@@ -787,6 +1126,47 @@
// Update address for the matrix B
src_addr.s1 += idx;
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
int end_row_vec_a = src_addr.s0 + COLS_A;
uint acc00 = 0;
@@ -1075,30 +1455,86 @@
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
+ const int z = get_global_id(2);
+
// Compute destination address
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
// Store the result
- vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
+ vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
+ vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
+ vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
+ vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
+ vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst.ptr += z * dst_stride_z;
+
+ // Store the result
+ vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
}
-#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 /** OpenCL kernel optimized to use dot product that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
*
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1117,10 +1553,27 @@
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom padding in units of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom padding in units of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
*/
__kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
{
int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
@@ -1133,214 +1586,361 @@
// Update address for the matrix B
src_addr.s1 += idx;
- int end_row_vec_a = src_addr.s0 + COLS_A;
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ zin += ((uint4)(0, 1, 2, 3)) * src0_stride_y;
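+ // Unlike the kernels above, zin here also folds in the per-row offset: zin.sN now holds N * src0_stride_y plus the
+ // cross-plane padding offset of row N's plane, so the loads in the loop below use src_addr.s0 + zin.sN directly.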
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
uint acc00 = 0;
uint acc01 = 0;
uint acc02 = 0;
uint acc03 = 0;
+ uint acc04 = 0;
+ uint acc05 = 0;
+ uint acc06 = 0;
+ uint acc07 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
uint acc10 = 0;
uint acc11 = 0;
uint acc12 = 0;
uint acc13 = 0;
+ uint acc14 = 0;
+ uint acc15 = 0;
+ uint acc16 = 0;
+ uint acc17 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
uint acc20 = 0;
uint acc21 = 0;
uint acc22 = 0;
uint acc23 = 0;
+ uint acc24 = 0;
+ uint acc25 = 0;
+ uint acc26 = 0;
+ uint acc27 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
uint acc30 = 0;
uint acc31 = 0;
uint acc32 = 0;
uint acc33 = 0;
+ uint acc34 = 0;
+ uint acc35 = 0;
+ uint acc36 = 0;
+ uint acc37 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- uint acc40 = 0;
- uint acc41 = 0;
- uint acc42 = 0;
- uint acc43 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
+ // A and B src indices get incremented at the same time.
+ int i = 0;
+ for(; i <= ((int)COLS_A - 8); i += 8)
{
- // Load values from matrix A
- uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A and matrix B
+ uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+ uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+ uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+ uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- // Load values from matrix B
- uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
- uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
- uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
- uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A and matrix B
+ uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
- {
- // Accumulate
- ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a0.s0, a0.s1, a0.s2, a0.s3, acc00);
- ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a0.s0, a0.s1, a0.s2, a0.s3, acc01);
- ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a0.s0, a0.s1, a0.s2, a0.s3, acc02);
- ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a0.s0, a0.s1, a0.s2, a0.s3, acc03);
- }
+ uchar8 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
+ uchar8 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
+ uchar8 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
+ uchar8 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
+ src_addr.s1 += 4 * src1_stride_y;
+
+ ARM_DOT(a0.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);
+ ARM_DOT(a0.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);
+ ARM_DOT(a0.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);
+ ARM_DOT(a0.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);
+ ARM_DOT(a0.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);
+ ARM_DOT(a0.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);
+ ARM_DOT(a0.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);
+ ARM_DOT(a0.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);
+
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- {
- // Accumulate
- ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a1.s0, a1.s1, a1.s2, a1.s3, acc10);
- ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a1.s0, a1.s1, a1.s2, a1.s3, acc11);
- ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a1.s0, a1.s1, a1.s2, a1.s3, acc12);
- ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a1.s0, a1.s1, a1.s2, a1.s3, acc13);
- }
+ ARM_DOT(a1.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);
+ ARM_DOT(a1.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);
+ ARM_DOT(a1.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);
+ ARM_DOT(a1.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);
+ ARM_DOT(a1.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);
+ ARM_DOT(a1.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);
+ ARM_DOT(a1.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);
+ ARM_DOT(a1.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- {
- // Accumulate
- ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a2.s0, a2.s1, a2.s2, a2.s3, acc20);
- ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a2.s0, a2.s1, a2.s2, a2.s3, acc21);
- ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a2.s0, a2.s1, a2.s2, a2.s3, acc22);
- ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a2.s0, a2.s1, a2.s2, a2.s3, acc23);
- }
+ ARM_DOT(a2.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);
+ ARM_DOT(a2.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);
+ ARM_DOT(a2.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);
+ ARM_DOT(a2.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);
+ ARM_DOT(a2.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);
+ ARM_DOT(a2.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);
+ ARM_DOT(a2.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);
+ ARM_DOT(a2.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- {
- // Accumulate
- ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a3.s0, a3.s1, a3.s2, a3.s3, acc30);
- ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a3.s0, a3.s1, a3.s2, a3.s3, acc31);
- ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a3.s0, a3.s1, a3.s2, a3.s3, acc32);
- ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a3.s0, a3.s1, a3.s2, a3.s3, acc33);
- }
+ ARM_DOT(a3.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);
+ ARM_DOT(a3.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);
+ ARM_DOT(a3.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);
+ ARM_DOT(a3.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);
+ ARM_DOT(a3.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);
+ ARM_DOT(a3.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);
+ ARM_DOT(a3.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);
+ ARM_DOT(a3.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- {
- // Accumulate
- ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a4.s0, a4.s1, a4.s2, a4.s3, acc40);
- ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a4.s0, a4.s1, a4.s2, a4.s3, acc41);
- ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a4.s0, a4.s1, a4.s2, a4.s3, acc42);
- ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a4.s0, a4.s1, a4.s2, a4.s3, acc43);
- }
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+ b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
+ b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
+ b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
+ b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
+ src_addr.s1 += 4 * src1_stride_y;
+
+ ARM_DOT(a0.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);
+ ARM_DOT(a0.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);
+ ARM_DOT(a0.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);
+ ARM_DOT(a0.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);
+ ARM_DOT(a0.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);
+ ARM_DOT(a0.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);
+ ARM_DOT(a0.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);
+ ARM_DOT(a0.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ ARM_DOT(a1.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);
+ ARM_DOT(a1.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);
+ ARM_DOT(a1.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);
+ ARM_DOT(a1.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);
+ ARM_DOT(a1.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);
+ ARM_DOT(a1.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);
+ ARM_DOT(a1.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);
+ ARM_DOT(a1.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ ARM_DOT(a2.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);
+ ARM_DOT(a2.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);
+ ARM_DOT(a2.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);
+ ARM_DOT(a2.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);
+ ARM_DOT(a2.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);
+ ARM_DOT(a2.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);
+ ARM_DOT(a2.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);
+ ARM_DOT(a2.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ ARM_DOT(a3.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);
+ ARM_DOT(a3.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);
+ ARM_DOT(a3.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);
+ ARM_DOT(a3.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);
+ ARM_DOT(a3.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);
+ ARM_DOT(a3.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);
+ ARM_DOT(a3.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);
+ ARM_DOT(a3.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 8;
}
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
+ for(; i < (int)COLS_A; ++i)
{
+#if defined(REINTERPRET_INPUT_AS_3D)
// Load values from matrix A
- uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+ uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+ uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+ uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+ uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
// Load values from matrix B
- uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);
+ uchar8 b0 = vload8(0, src1_ptr + src_addr.s1);
+ src_addr.s1 += src1_stride_y;
- // Accumulate
- {
- // Accumulate
- ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
- ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
- ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
- ushort tmp3 = (ushort)b0.s3 * (ushort)a0;
+ acc00 += (uint)a0 * b0.s0;
+ acc01 += (uint)a0 * b0.s1;
+ acc02 += (uint)a0 * b0.s2;
+ acc03 += (uint)a0 * b0.s3;
+ acc04 += (uint)a0 * b0.s4;
+ acc05 += (uint)a0 * b0.s5;
+ acc06 += (uint)a0 * b0.s6;
+ acc07 += (uint)a0 * b0.s7;
- acc00 += ((uint)tmp0);
- acc01 += ((uint)tmp1);
- acc02 += ((uint)tmp2);
- acc03 += ((uint)tmp3);
- }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- {
- // Accumulate
- ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
- ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
- ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
- ushort tmp3 = (ushort)b0.s3 * (ushort)a1;
-
- acc10 += ((uint)tmp0);
- acc11 += ((uint)tmp1);
- acc12 += ((uint)tmp2);
- acc13 += ((uint)tmp3);
- }
+ acc10 += (uint)a1 * b0.s0;
+ acc11 += (uint)a1 * b0.s1;
+ acc12 += (uint)a1 * b0.s2;
+ acc13 += (uint)a1 * b0.s3;
+ acc14 += (uint)a1 * b0.s4;
+ acc15 += (uint)a1 * b0.s5;
+ acc16 += (uint)a1 * b0.s6;
+ acc17 += (uint)a1 * b0.s7;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- {
- // Accumulate
- ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
- ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
- ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
- ushort tmp3 = (ushort)b0.s3 * (ushort)a2;
-
- acc20 += ((uint)tmp0);
- acc21 += ((uint)tmp1);
- acc22 += ((uint)tmp2);
- acc23 += ((uint)tmp3);
- }
+ acc20 += (uint)a2 * b0.s0;
+ acc21 += (uint)a2 * b0.s1;
+ acc22 += (uint)a2 * b0.s2;
+ acc23 += (uint)a2 * b0.s3;
+ acc24 += (uint)a2 * b0.s4;
+ acc25 += (uint)a2 * b0.s5;
+ acc26 += (uint)a2 * b0.s6;
+ acc27 += (uint)a2 * b0.s7;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- {
- // Accumulate
- ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
- ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
- ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
- ushort tmp3 = (ushort)b0.s3 * (ushort)a3;
-
- acc30 += ((uint)tmp0);
- acc31 += ((uint)tmp1);
- acc32 += ((uint)tmp2);
- acc33 += ((uint)tmp3);
- }
+ acc30 += (uint)a3 * b0.s0;
+ acc31 += (uint)a3 * b0.s1;
+ acc32 += (uint)a3 * b0.s2;
+ acc33 += (uint)a3 * b0.s3;
+ acc34 += (uint)a3 * b0.s4;
+ acc35 += (uint)a3 * b0.s5;
+ acc36 += (uint)a3 * b0.s6;
+ acc37 += (uint)a3 * b0.s7;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- {
- // Accumulate
- ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
- ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
- ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
- ushort tmp3 = (ushort)b0.s3 * (ushort)a4;
- acc40 += ((uint)tmp0);
- acc41 += ((uint)tmp1);
- acc42 += ((uint)tmp2);
- acc43 += ((uint)tmp3);
- }
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+ src_addr.s0 += 1;
}
+ int z = get_global_id(2);
+
// Compute destination address
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ // Compute dst address
+ __global uchar *dst_addr = dst.ptr;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
// Store the result
- vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
+ vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
+ vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
+ vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
+ vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));
+ vstore4((int4)(acc34, acc35, acc36, acc37), 1, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-}
-#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+ // Store the result
+ vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y));
+ vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y));
+ vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y));
+ vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y));
+ vstore4((int4)(acc34, acc35, acc36, acc37), 1, (__global int *)(dst_addr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+}
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
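For reference, a minimal host-side sketch in plain C of what a single accumulator of the dot8 kernel above ends up holding. The function and parameter names are invented for illustration, and it assumes ARM_DOT(x, y, acc) accumulates the 4-way dot product of its uchar4 operands into acc, as its use above suggests.

    #include <stdint.h>

    /* Scalar model of one accumulator accRC of gemmlowp_mm_bifrost_dot8: the dot product of
     * row r of the A tile with column c of B over the COLS_A accumulation dimension.
     * a_row points to row r of matrix A, b to the top-left of matrix B, b_stride is the
     * row stride of B in elements, c is the output column handled by this accumulator. */
    static uint32_t model_accumulator(const uint8_t *a_row, const uint8_t *b,
                                      int cols_a, int b_stride, int c)
    {
        uint32_t acc = 0;
        for(int k = 0; k < cols_a; ++k)
        {
            acc += (uint32_t)a_row[k] * (uint32_t)b[k * b_stride + c];
        }
        return acc;
    }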
#if defined(COLS_A)
@@ -1398,6 +1998,70 @@
*((__global int *)dst.ptr) = (int)sum_row;
}
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uint sum_row = 0;
+
+ __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
+
+ int i = 0;
+
+ // This for loop performs 32 accumulations per iteration
+ for(; i <= ((int)COLS_A - 32); i += 32)
+ {
+ uchar16 a0_u8 = vload16(0, matrix_a + i);
+
+ sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
+ sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
+ sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
+ sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
+
+ a0_u8 = vload16(1, matrix_a + i);
+
+ sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
+ sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
+ sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
+ sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
+ }
+
+ // This for loop performs the leftover accumulations
+ for(; i < COLS_A; ++i)
+ {
+ sum_row += matrix_a[i];
+ }
+
+ *((__global int *)dst.ptr) = (int)sum_row;
+}
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#endif // defined(COLS_A)
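The reduction above leans on the arm_dot() builtin from the cl_arm_integer_dot_product_int8 extension. A scalar model in plain C (illustrative only, names invented) of why dotting against a vector of ones yields a row sum:

    #include <stdint.h>

    /* Illustrative scalar model of arm_dot() on uchar4 operands. */
    static uint32_t arm_dot_model(const uint8_t a[4], const uint8_t b[4])
    {
        return (uint32_t)a[0] * b[0] + (uint32_t)a[1] * b[1]
             + (uint32_t)a[2] * b[2] + (uint32_t)a[3] * b[3];
    }

    /* With b = {1, 1, 1, 1} the dot product collapses to the lane sum, so the eight
     * arm_dot() calls per iteration above accumulate 32 matrix A entries into sum_row. */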
#if defined(COLS_B) && defined(ROWS_B)
@@ -1463,6 +2127,101 @@
#endif // defined(COLS_B) && defined(ROWS_B)
#if defined(K_OFFSET)
+
+/* Helper function used to calculate the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel.
+ *
+ * This function takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
+ * and calculates the offset contribution of matrix A and matrix B.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (e.g. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (e.g. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (e.g. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually, when gemmlowp is used to accelerate a convolution layer, sum_col does not have batches
+ *
+ * @param[in] x get_global_id(0) * 4
+ * @param[in] y get_global_id(1)
+ * @param[in] z get_global_id(2)
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ */
+inline int4 offset_contribution(
+ int x,
+ int y,
+ int z
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+ int4 a_offset_s32 = (int4)0;
+ int4 b_offset_s32 = (int4)0;
+
+ int batch_id = z;
+#if defined(DEPTH_INPUT3D)
+ batch_id /= (int)DEPTH_INPUT3D;
+#endif // defined(DEPTH_INPUT3D)
+
+#if defined(A_OFFSET)
+ // Compute the address of the vector of column sums (sum_col)
+ __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
+
+ // Compute the offset contribution due to A_OFFSET
+#if defined(SUM_COL_HAS_BATCHES)
+ a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
+#else // defined(SUM_COL_HAS_BATCHES)
+ a_offset_s32 = vload4(0, (__global int *)sum_col_addr);
+#endif // defined(SUM_COL_HAS_BATCHES)
+
+ a_offset_s32 *= (int4)A_OFFSET;
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+ // Compute the address of the vector of row sums (sum_row)
+ __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
+
+ // Compute the offset contribution due to B_OFFSET
+#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
+#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
+#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 *= (int4)B_OFFSET;
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ b_offset_s32 += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
+}
+
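A scalar model in plain C of the value returned by offset_contribution() for one int32 lane, following the formula documented in the comments around these kernels. The function name and the flattened arguments are invented for illustration.

    #include <stdint.h>

    /* offset_term = K_OFFSET + sum_col[x] * A_OFFSET + sum_row[y] * B_OFFSET (+ bias[x]),
     * where K_OFFSET = a_offset * b_offset * k and k is the number of matrix A columns. */
    static int32_t model_offset_term(int32_t sum_col_x, int32_t sum_row_y, int32_t bias_x,
                                     int32_t a_offset, int32_t b_offset, int32_t k)
    {
        int32_t term = a_offset * b_offset * k; /* K_OFFSET, passed at compile time */
        term += sum_col_x * a_offset;           /* contribution enabled by -DA_OFFSET */
        term += sum_row_y * b_offset;           /* contribution enabled by -DB_OFFSET */
        term += bias_x;                         /* optional bias, enabled by -DADD_BIAS */
        return term;
    }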
/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
*
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
@@ -1480,26 +2239,30 @@
* (sum_row[i] * B_OFFSET) +
* (K_OFFSET)
*
- * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
- * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_col_result_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_col_result_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_col_result_step_x sum_col_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_col_result_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_col_result_step_y sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_col_result_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_row_result_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_row_result_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_row_result_step_x sum_row_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_row_result_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_row_result_step_y sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_row_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
*/
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
@@ -1510,44 +2273,348 @@
,
IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
)
{
- Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
- int4 a_offset_s32 = (int4)0;
- int4 b_offset_s32 = (int4)0;
-
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
#if defined(A_OFFSET)
- Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);
-
- // Compute the offset contribution due to A_OFFSET
-#if defined(SUM_COL_HAS_BATCHES)
- a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
-#else // defined(MATRIX_B_HAS_BATCHES)
- a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr));
-#endif // defined(MATRIX_B_HAS_BATCHES)
-
- a_offset_s32 *= (int4)A_OFFSET;
+ ,
+ sum_col_ptr,
+ sum_col_stride_x,
+ sum_col_step_x,
+ sum_col_stride_y,
+ sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
#endif // defined(A_OFFSET)
-
#if defined(B_OFFSET)
- Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);
-
- // Compute the offset contribution due to B_OFFSET
- b_offset_s32 = (int4) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
- b_offset_s32 *= (int4)B_OFFSET;
+ ,
+ sum_row_ptr,
+ sum_row_stride_x,
+ sum_row_step_x,
+ sum_row_stride_y,
+ sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr,
+ biases_stride_x,
+ biases_step_x,
+ biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
- const int4 offset_term_s32 = (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
- int4 in_s32 = vload4(0, (__global int *)mm_result.ptr);
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
// Add the offset terms to GEMM's result
in_s32 += offset_term_s32;
// Store the result with the offset contribution
- vstore4(in_s32, 0, (__global int *)mm_result.ptr);
+ vstore4(in_s32, 0, (__global int *)mm_result_addr);
}
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
+/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and quantize the result down to QASYMM8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B, and quantizes the result down to QASYMM8 through the output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (e.g. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (e.g. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (e.g. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually, when gemmlowp is used to accelerate a convolution layer, sum_col does not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
+ *
+ * -# Add the offset contribution (and the bias, if -DADD_BIAS is passed at compile time) to the final result
+ * -# Add RESULT_OFFSET to each entry of the result
+ * -# Multiply each entry of the result by RESULT_MULTIPLIER
+ * -# Shift the int32 accumulator right by RESULT_SHIFT
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ ,
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr,
+ sum_col_stride_x,
+ sum_col_step_x,
+ sum_col_stride_y,
+ sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr,
+ sum_row_stride_x,
+ sum_row_step_x,
+ sum_row_stride_y,
+ sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr,
+ biases_stride_x,
+ biases_step_x,
+ biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // -------------- OUTPUT STAGE
+
+ // Add the output stage offset (RESULT_OFFSET)
+ in_s32 += (int4)RESULT_OFFSET;
+
+ // Multiply by RESULT_MULTIPLIER and shift right by RESULT_SHIFT
+ in_s32 *= RESULT_MULTIPLIER;
+
+ in_s32 >>= RESULT_SHIFT;
+
+ uchar4 res = convert_uchar4_sat(in_s32);
+
+#if defined(MIN_BOUND)
+ res = max(res, (uchar4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (uchar4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, dst_addr);
+}
+
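A per-lane model in plain C of the output stage performed by the kernel above (illustrative only; names are invented, and the optional MIN_BOUND/MAX_BOUND clamps are shown unconditionally):

    #include <stdint.h>

    /* Quantize one int32 accumulator down to QASYMM8 the way the kernel above does:
     * add the offset contribution, add RESULT_OFFSET, multiply, shift, saturate, clamp. */
    static uint8_t model_quantize_down(int32_t acc, int32_t offset_term,
                                       int32_t result_offset, int32_t result_multiplier,
                                       int32_t result_shift, uint8_t min_bound, uint8_t max_bound)
    {
        int32_t v = acc + offset_term;                          /* offset contribution (incl. optional bias) */
        v = (v + result_offset) * result_multiplier;            /* RESULT_OFFSET, then RESULT_MULTIPLIER */
        v >>= result_shift;                                     /* RESULT_SHIFT */
        uint8_t res = v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); /* convert_uchar4_sat equivalent */
        if(res < min_bound) res = min_bound;                    /* optional MIN_BOUND clamp */
        if(res > max_bound) res = max_bound;                    /* optional MAX_BOUND clamp */
        return res;
    }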
+/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and quantize the result down to QASYMM8 using a fixed-point multiplier.
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B, and quantizes the result down to QASYMM8 through the fixed-point output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (e.g. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (e.g. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (e.g. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually, when gemmlowp is used to accelerate a convolution layer, sum_col does not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
+ *
+ * -# Add the offset contribution (and the bias, if -DADD_BIAS is passed at compile time) to the final result
+ * -# Compute the fixed-point multiplication of each entry of the result by RESULT_MULTIPLIER
+ * -# Round to the nearest division by a power-of-two using RESULT_SHIFT
+ * -# Add RESULT_OFFSET to each result
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ *
+ * @attention The offset, fixed-point multiplier and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ ,
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr,
+ sum_col_stride_x,
+ sum_col_step_x,
+ sum_col_stride_y,
+ sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr,
+ sum_row_stride_x,
+ sum_row_step_x,
+ sum_row_stride_y,
+ sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr,
+ biases_stride_x,
+ biases_step_x,
+ biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // -------------- OUTPUT STAGE
+
+ // Fixed-point multiply by RESULT_MULTIPLIER and rounding shift right by RESULT_SHIFT
+ in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+
+ // Add the output stage offset (RESULT_OFFSET)
+ in_s32 += (int4)RESULT_OFFSET;
+
+ uchar4 res = convert_uchar4_sat(in_s32);
+
+#if defined(MIN_BOUND)
+ res = max(res, (uchar4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (uchar4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, dst_addr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
#endif // defined(K_OFFSET)
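The fixed-point path above relies on ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, i.e. a rounding doubling high multiply followed by a rounding right shift, in the spirit of the gemmlowp reference arithmetic. A plain C sketch (illustrative; the saturating corner case of INT32_MIN * INT32_MIN is omitted and the names are invented):

    #include <stdint.h>

    /* (a * b) as Q31 x Q31 -> Q31: take the high 32 bits of the doubled product, with rounding. */
    static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        int64_t prod  = (int64_t)a * (int64_t)b;
        int64_t nudge = prod >= 0 ? (1LL << 30) : (1 - (1LL << 30));
        return (int32_t)((prod + nudge) >> 31);
    }

    /* Rounding division by 2^shift (ties away from zero), as used for RESULT_SHIFT. */
    static int32_t rounding_divide_by_pot(int32_t x, int shift)
    {
        int32_t mask      = (1 << shift) - 1;
        int32_t remainder = x & mask;
        int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> shift) + (remainder > threshold ? 1 : 0);
    }

    /* Per-lane model of the fixed-point requantization used by the kernels above. */
    static int32_t model_fixedpoint_requant(int32_t v, int32_t multiplier, int shift)
    {
        return rounding_divide_by_pot(rounding_doubling_high_mul(v, multiplier), shift);
    }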
#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
@@ -1577,10 +2644,10 @@
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1597,39 +2664,43 @@
TENSOR3D_DECLARATION(dst))
{
// Compute source and destination addresses
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-#if defined(ADD_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-#endif // defined(ADD_BIAS)
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
- int16 input_values = vload16(0, (__global int *)src.ptr);
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
- // Add the offset terms to GEMM's result
- input_values += (int16)RESULT_OFFSET;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
#if defined(ADD_BIAS)
// Add bias
- const int16 biases_values = vload16(0, (__global int *)biases.ptr);
- input_values += (int16)biases_values;
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
#endif // defined(ADD_BIAS)
+ // Add the offset terms to GEMM's result
+ input_values += (int4)RESULT_OFFSET;
+
// Multiply by result_mult_int and shift
input_values *= RESULT_MULT_INT;
input_values >>= RESULT_SHIFT;
- uchar16 res = convert_uchar16_sat(input_values);
+ uchar4 res = convert_uchar4_sat(input_values);
#if defined(MIN_BOUND)
- res = max(res, (uchar16)MIN_BOUND);
+ res = max(res, (uchar4)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
- res = min(res, (uchar16)MAX_BOUND);
+ res = min(res, (uchar4)MAX_BOUND);
#endif // defined(MAX_BOUND)
// Store the result
- vstore16(res, 0, dst.ptr);
+ vstore4(res, 0, dst_addr);
}
#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
@@ -1646,7 +2717,92 @@
* -# Clamp the value between the specified min and max bounds
* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
*
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Multiply by result_mult_int and shift
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+
+ // Add the offset terms to GEMM's result
+ input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
+
+ uchar4 res = convert_uchar4_sat(input_values);
+
+#if defined(MIN_BOUND)
+ res = max(res, (uchar4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (uchar4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, dst_addr);
+}
+#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
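For reference, the per-element arithmetic that gemmlowp_output_stage_quantize_down_fixedpoint applies to each of its four lanes can be written as plain scalar C. This is only an illustrative sketch, not library code: the function names below are ours, and it assumes the rounding behaviour of the ASYMM_* helpers in helpers_asymm.h (nudge of 1 << 30, arithmetic right shift).

    #include <stdint.h>

    /* Rounding-doubling high multiply (high 32 bits of 2*a*b, Q31), mirroring ASYMM_MULT. */
    static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if(a == INT32_MIN && b == INT32_MIN)
        {
            return INT32_MAX; /* only case where 2*a*b overflows */
        }
        const int64_t ab = (int64_t)a * (int64_t)b;
        return (int32_t)((ab + (1ll << 30)) >> 31); /* arithmetic shift assumed */
    }

    /* Rounding arithmetic shift right, mirroring ASYMM_ROUNDING_DIVIDE_BY_POW2. */
    static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
    {
        const int32_t mask      = (int32_t)((1u << exponent) - 1);
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
        return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
    }

    static uint8_t quantize_down_fixedpoint(int32_t acc, int32_t bias, int32_t multiplier,
                                            int shift, int32_t offset_after_shift)
    {
        acc += bias;                                       /* optional, -DADD_BIAS             */
        acc = rounding_doubling_high_mul(acc, multiplier); /* -DRESULT_FIXEDPOINT_MULTIPLIER   */
        acc = rounding_divide_by_pow2(acc, shift);         /* -DRESULT_SHIFT                   */
        acc += offset_after_shift;                         /* -DRESULT_OFFSET_AFTER_SHIFT      */
        if(acc < 0)   acc = 0;                             /* saturating cast to QASYMM8, plus */
        if(acc > 255) acc = 255;                           /* optional MIN_BOUND / MAX_BOUND   */
        return (uint8_t)acc;
    }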
+
+#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Multiply each entry of the input by the real-valued scale factor (real_multiplier)
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Requantize
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The offset and scalar scale factor must be passed at compile time using -DOUTPUT_OFFSET and -DREAL_MULTIPLIER
*
* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
@@ -1671,45 +2827,53 @@
 * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
- VECTOR_DECLARATION(biases),
+ VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
- TENSOR3D_DECLARATION(dst))
+#if defined(DST_HEIGHT)
+ TENSOR4D_DECLARATION(dst))
+#else // defined(DST_HEIGHT)
+ TENSOR3D_DECLARATION(dst))
+#endif // defined(DST_HEIGHT)
{
// Compute source and destination addresses
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-#if defined(ADD_BIAS)
- Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-#endif // defined(ADD_BIAS)
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
- int16 input_values = vload16(0, (__global int *)src.ptr);
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
#if defined(ADD_BIAS)
// Add bias
- const int16 biases_values = vload16(0, (__global int *)biases.ptr);
- input_values += (int16)biases_values;
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
#endif // defined(ADD_BIAS)
- // Multiply by result_mult_int and shift
- input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);
+ // Convert to float
+    float4 input_values_f = convert_float4(input_values);
+ input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
- // Add the offset terms to GEMM's result
- input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;
-
- uchar16 res = convert_uchar16_sat(input_values);
+ uchar4 res = convert_uchar4_sat(input_values_f);
#if defined(MIN_BOUND)
- res = max(res, (uchar16)MIN_BOUND);
+ res = max(res, (uchar4)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
- res = min(res, (uchar16)MAX_BOUND);
+ res = min(res, (uchar4)MAX_BOUND);
#endif // defined(MAX_BOUND)
// Store the result
- vstore16(res, 0, dst.ptr);
+ vstore4(res, 0, dst_addr);
}
-#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
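The float variant above replaces the fixed-point multiply/shift pair with a single real-valued scale. A scalar C sketch of the per-element computation, using hypothetical names of our own:

    #include <math.h>
    #include <stdint.h>

    static uint8_t quantize_down_float(int32_t acc, int32_t bias,
                                       float real_multiplier, float output_offset)
    {
        acc += bias;                                                    /* optional, -DADD_BIAS               */
        float v = roundf((float)acc * real_multiplier + output_offset); /* -DREAL_MULTIPLIER, -DOUTPUT_OFFSET */
        if(v < 0.0f)   v = 0.0f;                                        /* saturating cast to QASYMM8, plus   */
        if(v > 255.0f) v = 255.0f;                                      /* optional MIN_BOUND / MAX_BOUND     */
        return (uint8_t)v;
    }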
diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/generate_proposals.cl
new file mode 100644
index 0000000..bc6f4b5
--- /dev/null
+++ b/src/core/CL/cl_kernels/generate_proposals.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Generate all the region of interests based on the image size and the anchors passed in. For each element (x,y) of the
+ * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
+ * -# -DHEIGHT= Height of the feature map on which this kernel is applied
+ * -# -DWIDTH= Width of the feature map on which this kernel is applied
+ * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel
+ * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE)
+ * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
+ *
+ * @param[in] anchors_ptr Pointer to the anchors tensor. Supported data types: F16/F32
+ * @param[in] anchors_stride_x Stride of the anchors tensor in X dimension (in bytes)
+ * @param[in] anchors_step_x anchors_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] anchors_stride_y Stride of the anchors tensor in Y dimension (in bytes)
+ * @param[in] anchors_step_y anchors_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] anchors_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] anchors_step_z anchors_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  anchors_offset_first_element_in_bytes The offset of the first element in the anchors tensor
+ * @param[out] rois_ptr                              Pointer to the rois. Supported data types: same as @p anchors_ptr
+ * @param[in]  rois_stride_x                         Stride of the rois in X dimension (in bytes)
+ * @param[in]  rois_step_x                           rois_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rois_stride_y                         Stride of the rois in Y dimension (in bytes)
+ * @param[in]  rois_step_y                           rois_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rois_stride_z                         Stride of the rois in Z dimension (in bytes)
+ * @param[in]  rois_step_z                           rois_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes    The offset of the first element in the rois
+ */
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
+__kernel void generate_proposals_compute_all_anchors(
+ VECTOR_DECLARATION(anchors),
+ VECTOR_DECLARATION(rois))
+{
+ Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
+ Vector rois = CONVERT_TO_VECTOR_STRUCT(rois);
+
+ const size_t idx = get_global_id(0);
+ // Find the index of the anchor
+ const size_t anchor_idx = idx % NUM_ANCHORS;
+
+ // Find which shift is this thread using
+ const size_t shift_idx = idx / NUM_ANCHORS;
+
+ // Compute the shift on the X and Y direction (the shift depends exclusively by the index thread id)
+ const DATA_TYPE
+ shift_x = (DATA_TYPE)(shift_idx % WIDTH) * STRIDE;
+ const DATA_TYPE
+ shift_y = (DATA_TYPE)(shift_idx / WIDTH) * STRIDE;
+
+ const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+ shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
+
+ // Read the given anchor
+ const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+ anchor = vload4(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS));
+
+ // Apply the shift to the anchor
+ const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+ shifted_anchor = anchor + shift;
+
+ vstore4(shifted_anchor, 0, (__global DATA_TYPE *)rois.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
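The kernel above is essentially index arithmetic: each work-item picks one of the NUM_ANCHORS base anchors and shifts it by the grid position encoded in its global id. A host-side C sketch of the same mapping, assuming NUM_ROI_FIELDS == 4 (as the vload4/vstore4 calls already do); the function and parameter names are ours:

    #include <stddef.h>

    static void compute_all_anchors(const float *anchors, /* NUM_ANCHORS x 4 (x1, y1, x2, y2)      */
                                    float *rois,          /* WIDTH * HEIGHT * NUM_ANCHORS x 4 out  */
                                    size_t width, size_t height,
                                    size_t num_anchors, float stride)
    {
        for(size_t idx = 0; idx < width * height * num_anchors; ++idx)
        {
            const size_t anchor_idx = idx % num_anchors;           /* which base anchor   */
            const size_t shift_idx  = idx / num_anchors;           /* which grid position */
            const float  shift_x    = (float)(shift_idx % width) * stride;
            const float  shift_y    = (float)(shift_idx / width) * stride;

            for(size_t f = 0; f < 4; ++f)
            {
                /* shift pattern is (shift_x, shift_y, shift_x, shift_y) */
                const float shift = (f % 2 == 0) ? shift_x : shift_y;
                rois[idx * 4 + f] = anchors[anchor_idx * 4 + f] + shift;
            }
        }
    }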
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 3f7a2a5..7ee97d9 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -24,23 +24,21 @@
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-#if defined(cl_arm_printf)
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(cl_arm_printf)
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#define EXPAND(x) x
@@ -185,7 +183,7 @@
*
* @return An image object
*/
-Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
Vector vector =
{
@@ -208,7 +206,7 @@
*
* @return An image object
*/
-Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
Image img =
{
@@ -234,7 +232,7 @@
*
* @return A 3D tensor object
*/
-Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
Image img =
{
@@ -260,7 +258,7 @@
*
* @return A 3D tensor object
*/
-Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
Tensor3D tensor =
{
@@ -274,7 +272,7 @@
return tensor;
}
-Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
uint step_w,
uint mod_size)
{
@@ -297,7 +295,7 @@
* @param[in] vec Pointer to the starting position of the buffer
* @param[in] x Relative X position
*/
-__global inline const uchar *vector_offset(const Vector *vec, int x)
+inline __global const uchar *vector_offset(const Vector *vec, int x)
{
return vec->ptr + x * vec->stride_x;
}
@@ -308,7 +306,7 @@
* @param[in] x Relative X position
* @param[in] y Relative Y position
*/
-__global inline uchar *offset(const Image *img, int x, int y)
+inline __global uchar *offset(const Image *img, int x, int y)
{
return img->ptr + x * img->stride_x + y * img->stride_y;
}
@@ -320,7 +318,7 @@
* @param[in] y Relative Y position
* @param[in] z Relative Z position
*/
-__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}
@@ -333,7 +331,7 @@
* @param[in] z Relative Z position
* @param[in] w Relative W position
*/
-__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index a69bcc1..c314d17 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -62,6 +62,7 @@
b_64 = convert_long##size(b); \
VEC_DATA_TYPE(long, size) \
ab_64 = a_64 * b_64; \
+ /* COMPMID-907 */ \
VEC_DATA_TYPE(int, size) \
ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
return select(ab_x2_high32, INT_MAX, overflow); \
@@ -366,4 +367,4 @@
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)
-#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
index f58e98b..5f66efb 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -23,7 +23,7 @@
*/
#include "helpers.h"
-/** This kernel performs reduction given an operation.
+/** This kernel performs l2 normalization along the x-axis.
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
@@ -42,7 +42,7 @@
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] epsilon Epsilon value
*/
-__kernel void l2_normalize(
+__kernel void l2_normalize_x(
VECTOR_DECLARATION(src),
VECTOR_DECLARATION(sum),
VECTOR_DECLARATION(dst),
@@ -55,7 +55,104 @@
VEC_DATA_TYPE(DATA_TYPE, 16)
in = vload16(0, (__global DATA_TYPE *)src.ptr);
VEC_DATA_TYPE(DATA_TYPE, 16)
- normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))native_rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon));
+ normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon));
+
+ vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** This kernel performs l2 normalization along the y-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] epsilon Epsilon value
+ */
+__kernel void l2_normalize_y(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(sum),
+ IMAGE_DECLARATION(dst),
+ DATA_TYPE epsilon)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, (__global DATA_TYPE *)src.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ sums = vload16(0, (__global DATA_TYPE *)sum.ptr);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));
+
+ vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
+/** This kernel performs l2 normalization along the z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] epsilon Epsilon value
+ */
+__kernel void l2_normalize_z(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(sum),
+ TENSOR3D_DECLARATION(dst),
+ DATA_TYPE epsilon)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D sum = CONVERT_TO_TENSOR3D_STRUCT(sum);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, (__global DATA_TYPE *)src.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ sums = vload16(0, (__global DATA_TYPE *)sum.ptr);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));
vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
}
\ No newline at end of file
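All three variants share the same arithmetic: every element is scaled by the reciprocal square root of the (clamped) sum of squares computed along the normalized axis by a preceding reduction kernel. A scalar C sketch of the x-axis case, with hypothetical names of our own:

    #include <math.h>
    #include <stddef.h>

    /* Normalize one row of length n, given the precomputed sum of squares of that row. */
    static void l2_normalize_row(const float *in, float *out, size_t n,
                                 float sum_sq, float epsilon)
    {
        const float norm = 1.0f / sqrtf(fmaxf(sum_sq, epsilon)); /* rsqrt(fmax(sum, epsilon)) */
        for(size_t i = 0; i < n; ++i)
        {
            out[i] = in[i] * norm;
        }
    }

The y- and z-axis kernels differ only in which axis the precomputed sums run along.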
diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/memset.cl
new file mode 100644
index 0000000..80b34eb
--- /dev/null
+++ b/src/core/CL/cl_kernels/memset.cl
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants
+
+/** Fill the tensor's planes with a given value
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DCONSTANT_VALUE = The value used to fill the tensor's planes
+ * -# -DVEC_SIZE = Vector size
+ * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in] tensor_ptr Pointer to the source image. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] tensor_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] tensor_step_x tensor_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] tensor_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] tensor_step_y tensor_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] value The value used to fill the pages of the tensor
+ */
+__kernel void memset(
+ IMAGE_DECLARATION(tensor))
+{
+ Image tensor = CONVERT_TO_IMAGE_STRUCT(tensor);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = (DATA_TYPE)(CONSTANT_VALUE);
+
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)tensor.ptr);
+#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+
+#endif // Check for compile time constants
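The LAST_ACCESSED_X handling above is the usual out-of-bounds guard for vectorized stores: a work-item whose VEC_SIZE-wide write would run past the row slides its pointer back so the write stays in bounds, and the overlapping elements are simply rewritten with the same value. A C sketch of the idea, assuming width >= vec_size; the names are ours:

    #include <stddef.h>

    static void fill_row_vec(float *row, size_t width, size_t vec_size, float value)
    {
        const size_t last_accessed_x = width - vec_size; /* corresponds to -DLAST_ACCESSED_X */
        for(size_t wi = 0; wi * vec_size < width; ++wi)  /* one iteration per work-item      */
        {
            size_t x = wi * vec_size;
            if(x > last_accessed_x)
            {
                x = last_accessed_x; /* step back so that x + vec_size <= width */
            }
            for(size_t i = 0; i < vec_size; ++i)
            {
                row[x + i] = value;
            }
        }
    }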
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index dbdad27..0b6df39 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -92,6 +92,7 @@
STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
}
+#if defined(WIDTH_SIZE)
/** Apply in-map normalization.
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
@@ -133,7 +134,7 @@
const int current_col = get_global_id(0) << 2;
const int left_pos = max(-(int)RADIUS, -3 - current_col);
- const int right_pos = min((int)RADIUS, (int)((get_global_size(0) << 2) + 3 - 1 - current_col));
+ const int right_pos = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
#if defined(IN_MAP_2D)
const int current_row = get_global_id(1);
@@ -168,3 +169,4 @@
STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
}
+#endif // defined(WIDTH_SIZE)
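The change above clips the right edge of the in-map normalization window against the real tensor width (WIDTH_SIZE) instead of the padded global work size. A scalar sketch of the clamped bound, with a hypothetical helper name:

    /* Right extent of the normalization window for a work-item at column current_col. */
    static int clamp_right_pos(int radius, int width_size, int current_col)
    {
        const int right_pos = width_size - 1 - current_col;
        return (radius < right_pos) ? radius : right_pos; /* min((int)RADIUS, WIDTH_SIZE - 1 - current_col) */
    }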
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
new file mode 100644
index 0000000..a105968
--- /dev/null
+++ b/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector std = CONVERT_TO_VECTOR_STRUCT(std);
+
+ const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+ const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)));
+ const DATA_TYPE curr_std = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)));
+
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+ TYPE res = (data - curr_mean) / curr_std;
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector std = CONVERT_TO_VECTOR_STRUCT(std);
+
+ const uint current_slice = get_global_id(0);
+
+ const TYPE curr_mean = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE)));
+ const TYPE curr_std = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(std.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE)));
+
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+ TYPE res = (data - curr_mean) / curr_std;
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
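Both layout variants apply the same per-element operation, (value - mean[c]) / std[c]; they differ only in how the channel index c is recovered (Z modulo NUM_CHANNELS for NCHW, the X coordinate for NHWC). A host-side C sketch of the NCHW case, with hypothetical names:

    #include <stddef.h>

    static void normalize_planar_yuv_nchw(const float *src, float *dst,
                                          const float *mean, const float *std,
                                          size_t channels, size_t plane_size, size_t batches)
    {
        for(size_t b = 0; b < batches; ++b)
        {
            for(size_t c = 0; c < channels; ++c) /* channel = (z % NUM_CHANNELS) slice */
            {
                for(size_t i = 0; i < plane_size; ++i)
                {
                    const size_t idx = (b * channels + c) * plane_size + i;
                    dst[idx] = (src[idx] - mean[c]) / std[c];
                }
            }
        }
    }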
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
new file mode 100644
index 0000000..925975d
--- /dev/null
+++ b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSET_FLT ((float)OFFSET)
+#define SCALE_FLT ((float)SCALE)
+
+#if defined(NUM_CHANNELS)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector std = CONVERT_TO_VECTOR_STRUCT(std);
+
+ const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+ float16 curr_mean_flt = (float16)(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
+ curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+ float16 curr_std_flt = (float16)(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
+ curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+ float16 data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), float16);
+ data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
+
+ // Perform normalization
+ float16 res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+ const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+ VSTORE(VEC_SIZE)
+ (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+#endif // defined(NUM_CHANNELS)
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector std = CONVERT_TO_VECTOR_STRUCT(std);
+
+ const uint current_slice = get_global_id(0);
+
+ float16 curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE))), float16);
+ curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+ float16 curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(std.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE))), float16);
+ curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+ float16 data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), float16);
+ data_flt = round(data_flt - OFFSET_FLT) * (SCALE_FLT);
+
+ // Perform normalization
+ float16 res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+ const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+ VSTORE(VEC_SIZE)
+ (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
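The quantized variants dequantize the input, mean and std with the common (OFFSET, SCALE) pair, normalize in float, and requantize the result. A scalar C sketch of the per-element path; the function and parameter names are ours:

    #include <math.h>
    #include <stdint.h>

    static uint8_t normalize_planar_yuv_q8(uint8_t q_value, uint8_t q_mean, uint8_t q_std,
                                           float offset, float scale)
    {
        const float value = ((float)q_value - offset) * scale; /* dequantize input    */
        const float mean  = ((float)q_mean - offset) * scale;  /* dequantize mean     */
        const float std   = ((float)q_std - offset) * scale;   /* dequantize std      */
        const float res   = (value - mean) / std;              /* normalize in float  */

        float q = roundf(res / scale) + offset;                /* requantize          */
        if(q < 0.0f)   q = 0.0f;                               /* saturate to QASYMM8 */
        if(q > 255.0f) q = 255.0f;
        return (uint8_t)q;
    }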
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
index f4f36a0..9fa540e 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#endif /* SATURATE */
#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT)
/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
*
* @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
@@ -93,3 +94,4 @@
// Store result
vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
}
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
index c99a08a..5b3acb7 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -32,6 +32,7 @@
#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT)
/** Performs a pixelwise multiplication with integer scale of integer inputs.
*
* @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
@@ -85,3 +86,70 @@
// Perform multiplication and store result
vstore16(MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
}
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+/** Performs a pixelwise multiplication with float scale of quantized inputs.
+ *
+ * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: QASYMM8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_quantized(
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
+ const float scale)
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16);
+ int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16);
+
+ // Dequantize
+ in_a -= (int16)(int)OFFSET_IN1;
+ in_b -= (int16)(int)OFFSET_IN2;
+ const float16 in1f32 = convert_float16(in_a) * (float16)(float)SCALE_IN1;
+ const float16 in2f32 = convert_float16(in_b) * (float16)(float)SCALE_IN2;
+
+    const float16 qresf32 = (in1f32 * in2f32 * scale) / ((float16)(float)SCALE_OUT) + ((float16)((float)OFFSET_OUT));
+ const uchar16 res = convert_uchar16_sat(convert_int16_rte(qresf32));
+
+ // Store result
+ vstore16(res, 0, (__global uchar *)out.ptr);
+}
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
\ No newline at end of file
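pixelwise_mul_quantized follows the usual dequantize-multiply-requantize pattern. A scalar C sketch of the per-element computation with hypothetical names; note that the kernel rounds with convert_int16_rte (round-to-nearest-even), while roundf below rounds halves away from zero:

    #include <math.h>
    #include <stdint.h>

    static uint8_t mul_qasymm8(uint8_t a, uint8_t b,
                               int offset_in1, float scale_in1,
                               int offset_in2, float scale_in2,
                               int offset_out, float scale_out,
                               float scale)
    {
        const float af = (float)(a - offset_in1) * scale_in1; /* dequantize first operand  */
        const float bf = (float)(b - offset_in2) * scale_in2; /* dequantize second operand */

        float q = (af * bf * scale) / scale_out + (float)offset_out; /* requantize */
        q = roundf(q);
        if(q < 0.0f)   q = 0.0f; /* saturate to QASYMM8 */
        if(q > 255.0f) q = 255.0f;
        return (uint8_t)q;
    }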
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 0808353..7d15d10 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -489,7 +489,11 @@
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
int start_x = get_global_id(1) * stride_x - pad_x;
+#if defined(DST_DEPTH)
+ int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y;
+#else /* defined(DST_DEPTH) */
int start_y = get_global_id(2) * stride_y - pad_y;
+#endif /* defined(DST_DEPTH) */
#if !defined(EXCLUDE_PADDING)
upper_bound_w += pad_x;
@@ -522,30 +526,43 @@
* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
__kernel void pooling_layer_MxN_nhwc(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
{
// Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#if defined(DST_DEPTH)
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+#else /* defined(DST_DEPTH) */
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* defined(DST_DEPTH) */
VEC_DATA_TYPE(DATA_TYPE, 8)
vdata = INITIAL_VALUE;
DATA_TYPE sdata = INITIAL_VALUE;
- const int idx_width = get_global_id(1) * STRIDE_X;
+ const int idx_width = get_global_id(1) * STRIDE_X;
+#if defined(DST_DEPTH)
+ const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y;
+#else /* defined(DST_DEPTH) */
const int idx_height = get_global_id(2) * STRIDE_Y;
+#endif /* defined(DST_DEPTH) */
for(int y = 0; y < POOL_SIZE_Y; ++y)
{
@@ -555,8 +572,14 @@
int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH);
x1 = select(x1, PAD_X - idx_width - 1, y != y1);
+#if defined(DST_DEPTH)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data0 = vload8(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
+#else /* defined(DST_DEPTH) */
VEC_DATA_TYPE(DATA_TYPE, 8)
data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+#endif /* defined(DST_DEPTH) */
+
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
data0 *= data0;
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index 17d893a..198250b 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -126,7 +126,11 @@
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
int start_x = get_global_id(1) * stride_x - pad_x;
- int start_y = get_global_id(2) * stride_y - pad_y;
+#if defined(DST_DEPTH)
+ int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y;
+#else /* defined(DST_DEPTH) */
+ int start_y = get_global_id(2) * stride_y - pad_y;
+#endif /* defined(DST_DEPTH) */
const int end_x = min(start_x + pool_size_x, upper_bound_w);
const int end_y = min(start_y + pool_size_y, upper_bound_h);
@@ -153,39 +157,58 @@
* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
* @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
__kernel void pooling_layer_MxN_quantized_nhwc(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
{
// Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#if defined(DST_DEPTH)
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+#else /* defined(DST_DEPTH) */
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* defined(DST_DEPTH) */
int8 vdata = 0;
- const int idx_width = get_global_id(1) * STRIDE_X;
- const int idx_height = get_global_id(2) * STRIDE_Y;
+ const int idx_width = get_global_id(1) * STRIDE_X;
+#if defined(DST_DEPTH)
+ const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y;
+#else /* defined(DST_DEPTH) */
+ const int idx_height = get_global_id(2) * STRIDE_Y;
+#endif /* defined(DST_DEPTH) */
for(int y = 0; y < POOL_SIZE_Y; ++y)
{
- int y1 = select(y, PAD_Y - idx_height, y + idx_height < PAD_Y || y + idx_height > MAX_HEIGHT);
+ int y1 = select(y, PAD_Y - idx_height, y + idx_height - PAD_Y < 0 || y + idx_height - PAD_Y >= MAX_HEIGHT);
for(int x = 0; x < POOL_SIZE_X; ++x)
{
- int x1 = select(x, PAD_X - idx_width - 1, x + idx_width < PAD_X || x + idx_width > MAX_WIDTH);
- x1 = select(x1, PAD_X - idx_width - 1, y != y1);
- uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
- int8 data0 = convert_int8(data);
- vdata = POOL_OP(vdata, data0);
+ int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH);
+ x1 = select(x1, PAD_X - idx_width - 1, y != y1);
+
+#if defined(DST_DEPTH)
+ uchar8 data = vload8(0, (__global uchar *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
+#else /* defined(DST_DEPTH) */
+ uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+#endif /* defined(DST_DEPTH) */
+
+ int8 data0 = convert_int8(data);
+ vdata = POOL_OP(vdata, data0);
}
}
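
The select() calls above redirect any pooling-window element that falls outside the valid input onto the padded border, so the vector loads never read out of bounds. The validity test itself is a plain range check on the un-padded coordinate; a host-side sketch of that check (C++, an illustrative helper rather than a library function):

// True when window element (x, y), for a window anchored at
// (idx_width, idx_height), falls outside the valid (un-padded) input,
// mirroring the conditions passed to select() in the NHWC kernels.
inline bool is_padding_element(int x, int y,
                               int idx_width, int idx_height,
                               int pad_x, int pad_y,
                               int max_width, int max_height)
{
    const int in_x = x + idx_width - pad_x;  // input column
    const int in_y = y + idx_height - pad_y; // input row
    return in_x < 0 || in_x >= max_width || in_y < 0 || in_y >= max_height;
}
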
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/prior_box_layer.cl
new file mode 100644
index 0000000..be072ec
--- /dev/null
+++ b/src/core/CL/cl_kernels/prior_box_layer.cl
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3)
+
+/** Compute prior boxes and clip (NCHW)
+ *
+ * @param[in,out] out Image output
+ * @param[in] idx Index to write to
+ * @param[in] center_x Center value of the x axis
+ * @param[in] center_y Center value of the y axis
+ * @param[in] box_width Prior box width
+ * @param[in] box_height Prior box height
+ *
+ */
+inline void calculate_xy_min_max_nchw(Image *out, int idx, float center_x, float center_y, float box_width, float box_height)
+{
+ float xmin = (center_x - box_width / 2.f) / WIDTH;
+ float ymin = (center_y - box_height / 2.f) / HEIGHT;
+ float xmax = (center_x + box_width / 2.f) / WIDTH;
+ float ymax = (center_y + box_height / 2.f) / HEIGHT;
+
+#if defined(CLIP)
+ xmin = clamp(xmin, 0.f, 1.f);
+ ymin = clamp(ymin, 0.f, 1.f);
+ xmax = clamp(xmax, 0.f, 1.f);
+ ymax = clamp(ymax, 0.f, 1.f);
+#endif // defined(CLIP)
+
+ // Store result
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(xmin, ymin, xmax, ymax), 0, ((__global DATA_TYPE *)offset(out, idx + 0, 0)));
+}
+
+/** Compute prior boxes (NCHW)
+ *
+ * @param[in,out] out Image output
+ * @param[in] max The maximum values
+ * @param[in] aspect_ratios The aspect ratio values
+ * @param[in] max_size The maximum values size
+ * @param[in] aspect_ratios_size The aspect ratio values size
+ * @param[in] min_size Prior box min size
+ * @param[in] min_idx Index of the min vector
+ * @param[in] idx Index to write to
+ *
+ * @return The updated index
+ */
+inline int calculate_min_nchw(Image *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
+{
+ const float center_x = ((float)(get_global_id(0) % LAYER_WIDTH) + OFFSET) * STEP_X;
+ const float center_y = ((float)(get_global_id(0) / LAYER_WIDTH) + OFFSET) * STEP_Y;
+
+ float box_width = min_size;
+ float box_height = min_size;
+ calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+ idx += 4;
+
+ if(max_size > 0)
+ {
+ box_width = sqrt(min_size * max[min_idx]);
+ box_height = box_width;
+ calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+ idx += 4;
+ }
+ for(unsigned int i = 0; i < aspect_ratios_size; ++i)
+ {
+ if(fabs(aspect_ratios[i] - 1.f) < 1e-6f)
+ {
+ continue;
+ }
+ box_width = min_size * sqrt(aspect_ratios[i]);
+ box_height = min_size * rsqrt(aspect_ratios[i]);
+
+ calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+ idx += 4;
+ }
+
+ return idx;
+}
+
+/** Compute prior boxes and clip (NHWC)
+ *
+ * @param[in,out] out Tensor3D output
+ * @param[in] idx Index to write to
+ * @param[in] center_x Center value of the x axis
+ * @param[in] center_y Center value of the y axis
+ * @param[in] box_width Prior box width
+ * @param[in] box_height Prior box height
+ *
+ */
+inline void calculate_xy_min_max_nhwc(Tensor3D *out, int idx, float center_x, float center_y, float box_width, float box_height)
+{
+ float xmin = (center_x - box_width / 2.f) / WIDTH;
+ float ymin = (center_y - box_height / 2.f) / HEIGHT;
+ float xmax = (center_x + box_width / 2.f) / WIDTH;
+ float ymax = (center_y + box_height / 2.f) / HEIGHT;
+
+#if defined(CLIP)
+ xmin = clamp(xmin, 0.f, 1.f);
+ ymin = clamp(ymin, 0.f, 1.f);
+ xmax = clamp(xmax, 0.f, 1.f);
+ ymax = clamp(ymax, 0.f, 1.f);
+#endif // defined(CLIP)
+
+ *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 0, 0)) = xmin;
+ *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 1, 0)) = ymin;
+ *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 2, 0)) = xmax;
+ *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 3, 0)) = ymax;
+}
+
+/** Compute prior boxes (NHWC)
+ *
+ * @param[in,out] out Tensor output
+ * @param[in] max The maximum values
+ * @param[in] aspect_ratios The aspect ratio values
+ * @param[in] max_size The maximum values size
+ * @param[in] aspect_ratios_size The aspect ratio values size
+ * @param[in] min_size Prior box min size
+ * @param[in] min_idx Index of the min vector
+ * @param[in] idx Index to write to
+ *
+ * @return The updated index
+ */
+inline int calculate_min_nhwc(Tensor3D *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
+{
+ const float center_x = ((float)(get_global_id(1) % LAYER_WIDTH) + OFFSET) * STEP_X;
+ const float center_y = ((float)(get_global_id(1) / LAYER_WIDTH) + OFFSET) * STEP_Y;
+
+ float box_width = min_size;
+ float box_height = min_size;
+
+ calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
+ idx += 4;
+ if(max_size > 0)
+ {
+ box_width = sqrt(min_size * max[min_idx]);
+ box_height = box_width;
+ calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
+ idx += 4;
+ }
+ for(unsigned int i = 0; i < aspect_ratios_size; ++i)
+ {
+ if(fabs(aspect_ratios[i] - 1.f) < 1e-6f)
+ {
+ continue;
+ }
+ box_width = min_size * sqrt(aspect_ratios[i]);
+ box_height = min_size * rsqrt(aspect_ratios[i]);
+
+ calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
+ idx += 4;
+ }
+
+ return idx;
+}
+
+/** Calculate prior boxes with NCHW format.
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] min The minimum values
+ * @param[in] max The maximum values
+ * @param[in] aspect_ratios The aspect ratio values
+ * @param[in] min_size The minimum values size
+ * @param[in] max_size The maximum values size
+ * @param[in] aspect_ratios_size The aspect ratio values size
+ */
+__kernel void prior_box_layer_nchw(IMAGE_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
+ unsigned int aspect_ratios_size)
+{
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ int idx = 0;
+ for(unsigned int i = 0; i < min_size; ++i)
+ {
+ idx = calculate_min_nchw(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
+ }
+
+ // Store variances
+ for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
+ {
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1)));
+ }
+}
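
The number of boxes written per spatial location, NUM_PRIORS, follows directly from the loops in calculate_min_*: one box per minimum size, one extra box when maximum sizes are provided, and one box per aspect ratio different from 1; row 0 of the output holds the coordinates and row 1 the repeated variances. A host-side sketch of that count (C++; the helper is illustrative and assumes the max sizes, when present, pair up with the min sizes as in the kernel):

#include <cmath>
#include <cstddef>
#include <vector>

// Count the priors generated per output location, mirroring the kernel
// loops: min box + optional sqrt(min*max) box + one box per aspect
// ratio != 1, repeated for every min size.
inline std::size_t count_priors(std::size_t num_min_sizes,
                                std::size_t num_max_sizes,
                                const std::vector<float> &aspect_ratios)
{
    std::size_t per_min = 1;                // the min-size box
    per_min += (num_max_sizes > 0) ? 1 : 0; // the sqrt(min * max) box
    for(float ar : aspect_ratios)
    {
        if(std::fabs(ar - 1.f) >= 1e-6f)
        {
            ++per_min; // one box per aspect ratio != 1
        }
    }
    return num_min_sizes * per_min;
}
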
+
+/** Calculate prior boxes with NHWC format.
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] min The minimum values
+ * @param[in] max The maximum values
+ * @param[in] aspect_ratios The aspect ratio values
+ * @param[in] min_size The minimum values size
+ * @param[in] max_size The maximum values size
+ * @param[in] aspect_ratios_size The aspect ratio values size
+ */
+__kernel void prior_box_layer_nhwc(TENSOR3D_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
+ unsigned int aspect_ratios_size)
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int idx = 0;
+ for(unsigned int i = 0; i < min_size; ++i)
+ {
+ idx = calculate_min_nhwc(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
+ }
+
+ for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
+ {
+ *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 0, 1)) = VARIANCE_0;
+ *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 1, 1)) = VARIANCE_1;
+ *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 2, 1)) = VARIANCE_2;
+ *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 3, 1)) = VARIANCE_3;
+ }
+}
+#endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index aa7403b..d76e12a 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -61,13 +61,14 @@
return (in.s0 + in.s1);
}
-/** This kernel performs reduction given an operation.
+/** This kernel performs a parallel reduction on the x-axis, given an operation.
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
* @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
+ * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -81,7 +82,7 @@
* @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] local_sums Local buffer for storing the partial sum
*/
-__kernel void reduction_operation(
+__kernel void reduction_operation_x(
IMAGE_DECLARATION(src),
IMAGE_DECLARATION(partial_sum),
__local DATA_TYPE *local_sums)
@@ -109,7 +110,207 @@
if(lid == 0)
{
+#if defined(MEAN) && defined(WIDTH)
+ if(y == get_local_size(1) - 1)
+ {
+ local_sums[0] /= WIDTH;
+ }
+#endif /* defined(MEAN) && defined(WIDTH) */
((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
}
}
-}
\ No newline at end of file
+}
+
+#if defined(WIDTH)
+/** This kernel performs a reduction on the x-axis (QASYMM8).
+ *
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr Pointer to the output tensor holding the summed values. Supported data types: same as @p src_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_quantized_x(
+ VECTOR_DECLARATION(src),
+ VECTOR_DECLARATION(output))
+{
+ Vector src = CONVERT_TO_VECTOR_STRUCT(src);
+ Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+
+ uint res = 0;
+
+ for(unsigned int x = 0; x < WIDTH; ++x)
+ {
+ res += *((__global uchar *)vector_offset(&src, x));
+ }
+
+#if defined(MEAN)
+ res /= WIDTH;
+#endif /* defined(MEAN) */
+
+ // Store result
+ *((__global uchar *)output.ptr) = convert_uchar(res);
+}
+#endif /* defined(WIDTH) */
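
The quantized x-axis reduction is a serial accumulation into a wider type, with an optional integer division for the mean. A host-side reference of the same computation (C++, assuming a contiguous 8-bit row of length width; the helper name is illustrative):

#include <cstdint>

// Serial reference for reduction_operation_quantized_x: sum the row into
// a 32-bit accumulator, optionally divide by the width (integer division,
// as in the kernel), then narrow back to 8 bits as convert_uchar() would.
inline std::uint8_t reduce_row_qasymm8(const std::uint8_t *row, unsigned int width, bool mean)
{
    std::uint32_t acc = 0;
    for(unsigned int x = 0; x < width; ++x)
    {
        acc += row[x];
    }
    if(mean)
    {
        acc /= width;
    }
    return static_cast<std::uint8_t>(acc);
}
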
+
+#if defined(HEIGHT)
+/** This kernel performs a reduction on the y-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr Pointer to the output tensor holding the summed values. Supported data types: same as @p src_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_y(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(output))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ res = 0;
+
+ for(unsigned int y = 0; y < HEIGHT; ++y)
+ {
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+ in *= in;
+#endif // defined(SUM_SQUARE)
+ res += in;
+ }
+
+#if defined(MEAN)
+ res /= HEIGHT;
+#endif /* defined(MEAN) */
+
+ // Store result
+ vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(HEIGHT) */
+
+#if defined(DEPTH)
+/** This kernel performs a reduction on the z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr Pointer to the output tensor holding the summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_z(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ res = 0;
+
+ for(unsigned int z = 0; z < DEPTH; ++z)
+ {
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+ in *= in;
+#endif // defined(SUM_SQUARE)
+ res += in;
+ }
+
+#if defined(MEAN)
+ res /= DEPTH;
+#endif /* defined(MEAN) */
+
+ // Store result
+ vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs a reduction on the w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr Pointer to the output tensor holding the summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_w(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ res = 0;
+
+ for(unsigned int w = 0; w < BATCH; ++w)
+ {
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+ in *= in;
+#endif // defined(SUM_SQUARE)
+ res += in;
+ }
+
+#if defined(MEAN)
+ res /= BATCH;
+#endif /* defined(MEAN) */
+
+ // Store result
+ vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/reorg_layer.cl b/src/core/CL/cl_kernels/reorg_layer.cl
new file mode 100644
index 0000000..a275699
--- /dev/null
+++ b/src/core/CL/cl_kernels/reorg_layer.cl
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+
+#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \
+ ({ \
+ int offset = zo / (int)SRC_DEPTH; \
+ xi = xo * (int)STRIDE + offset % (int)STRIDE; \
+ yi = yo * (int)STRIDE + offset / (int)STRIDE; \
+ zi = zo % SRC_DEPTH; \
+ })
+
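
CALCULATE_SRC_COORDINATES maps every output element of the reorg back to its source element: the output channel index selects both the original channel (zo % SRC_DEPTH) and the cell of the STRIDE x STRIDE block it came from. A host-side sketch of the same mapping (C++, illustrative names only):

// Map output coordinates (xo, yo, zo) of a reorg layer back to the source
// coordinates (xi, yi, zi), as done by CALCULATE_SRC_COORDINATES.
struct Coords3D
{
    int x, y, z;
};

inline Coords3D reorg_src_coords(int xo, int yo, int zo, int src_depth, int stride)
{
    const int offset = zo / src_depth;     // which cell of the stride x stride block
    Coords3D src;
    src.x = xo * stride + offset % stride; // column inside the block
    src.y = yo * stride + offset / stride; // row inside the block
    src.z = zo % src_depth;                // original channel
    return src;
}
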
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ int xo = get_global_id(0);
+ int yo = get_global_id(1);
+ int zo = get_global_id(2);
+ int xi, yi, zi;
+
+ CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+ int src_offset = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ int xo = get_global_id(1);
+ int yo = get_global_id(2);
+ int zo = get_global_id(0);
+ int xi, yi, zi;
+
+ CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+ int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/roi_align_layer.cl
new file mode 100644
index 0000000..f52eb18
--- /dev/null
+++ b/src/core/CL/cl_kernels/roi_align_layer.cl
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
+// It is close to machine epsilon (for a floating-point system, x and x+EPS are treated as the same number).
+#define EPS_GRID 0.00001f
+
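
Subtracting EPS_GRID before ceil() keeps a bin size that is only marginally above an integer from gaining an extra sample; a tiny illustration (C++, illustrative helper):

#include <cmath>

// Without the epsilon, a bin size of 1.000001 would round up to 2 samples.
inline int sampling_grid_size(float bin_size, float eps = 0.00001f)
{
    return static_cast<int>(std::ceil(bin_size - eps)); // 1.000001 -> 1, 1.5 -> 2
}
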
+#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) // Check for compile time constants
+
+/** Performs a roi align on a single output pixel.
+ *
+ * @param[in] input Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] region_end_x End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] region_end_y End y index projected onto the input tensor.
+ * @param[in] pz z index of the input tensor.
+ *
+ * @return An average pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
+ float bin_size_x,
+ float grid_size_x,
+ float region_end_x,
+ float region_start_y,
+ float bin_size_y,
+ float grid_size_y,
+ float region_end_y,
+ int pz)
+{
+ // Iterate through the pooling region
+ float sum = 0;
+ for(int iy = 0; iy < grid_size_y; ++iy)
+ {
+ for(int ix = 0; ix < grid_size_x; ++ix)
+ {
+ // Align the window in the middle of every bin
+ const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
+ const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
+
+ // Interpolation in the unit square
+ const int y_low = (int)y;
+ const int x_low = (int)x;
+ const int y_high = y_low + 1;
+ const int x_high = x_low + 1;
+
+ const float ly = y - y_low;
+ const float lx = x - x_low;
+ const float hy = 1.f - ly;
+ const float hx = 1.f - lx;
+
+ const float w1 = hy * hx;
+ const float w2 = hy * lx;
+ const float w3 = ly * hx;
+ const float w4 = ly * lx;
+
+ const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+ const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+ const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+ const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+ sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+ }
+ }
+
+ return (DATA_TYPE)(sum / (grid_size_x * grid_size_y));
+}
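
Each sample inside a bin is bilinearly interpolated from its four surrounding input pixels, weighted by the fractional part of the sample position. A host-side sketch of the same interpolation (C++, assuming a row-major plane and in-range neighbour indices; the helper is illustrative):

// Bilinear interpolation at (x, y) in a row-major width x height plane,
// using the same w1..w4 weighting as roi_align_1x1.
inline float bilinear_sample(const float *plane, int width, float x, float y)
{
    const int   x_low  = static_cast<int>(x);
    const int   y_low  = static_cast<int>(y);
    const int   x_high = x_low + 1;
    const int   y_high = y_low + 1;
    const float lx     = x - x_low;
    const float ly     = y - y_low;
    const float hx     = 1.f - lx;
    const float hy     = 1.f - ly;

    const float v1 = plane[y_low * width + x_low];
    const float v2 = plane[y_low * width + x_high];
    const float v3 = plane[y_high * width + x_low];
    const float v4 = plane[y_high * width + x_high];

    return hy * hx * v1 + hy * lx * v2 + ly * hx * v3 + ly * lx * v4;
}
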
+
+/** Performs a roi align function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
+ * will have a default sampling ratio of roi_dims/pooling_dims
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source tensor as specified by ROI
+ * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void roi_align_layer(
+ TENSOR3D_DECLARATION(input),
+ IMAGE_DECLARATION(rois),
+ TENSOR3D_DECLARATION(output),
+ unsigned int input_stride_w, unsigned int output_stride_w)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ Image rois = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pw = get_global_id(2);
+
+ // Load roi parameters
+ // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+ const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+ const VEC_DATA_TYPE(DATA_TYPE, 4)
+ roi = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+ const float2 roi_anchor = convert_float2(roi.s01) * convert_float(SPATIAL_SCALE);
+ const float2 roi_dims = fmax(convert_float2(roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
+
+ // Calculate pooled region start and end
+ const float2 spatial_indx = (float2)(px, py);
+ const float2 pooled_dims = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+ const float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
+
+ const float2 bin_size = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
+ float2 region_start = spatial_indx * bin_size + roi_anchor;
+ float2 region_end = (spatial_indx + 1) * bin_size + roi_anchor;
+
+ region_start = clamp(region_start, 0, max_spatial_dims);
+ region_end = clamp(region_end, 0, max_spatial_dims);
+
+#if defined(SAMPLING_RATIO)
+ const float2 roi_bin_grid = SAMPLING_RATIO;
+#else // !defined(SAMPLING_RATIO)
+ // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
+ const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
+#endif // defined(SAMPLING_RATIO)
+
+ // Move input and output pointer across the fourth dimension
+ input.ptr += roi_batch * input_stride_w;
+ output.ptr += pw * output_stride_w;
+ for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+ {
+ *(__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz) = (__global DATA_TYPE)roi_align_1x1(&input,
+ region_start.x,
+ bin_size.x,
+ roi_bin_grid.x,
+ region_end.x,
+ region_start.y,
+ bin_size.y,
+ roi_bin_grid.y,
+ region_end.y, pz);
+ }
+}
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl
new file mode 100644
index 0000000..3211e7e
--- /dev/null
+++ b/src/core/CL/cl_kernels/scale_quantized.cl
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+#include "warp_helpers_quantized.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ const float4 new_x = in_x_coords * (float4)(scale.s0);
+ const float4 new_y = (float4)(coord.s1 * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
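
The two sampling policies differ only in how an output coordinate is projected onto the input: top-left scales the index directly, while centre scales the pixel centre and shifts back by half a pixel. A host-side sketch of the two mappings (C++, illustrative helpers):

// Project an output coordinate onto the input for the two sampling policies.
inline float project_top_left(int out_coord, float scale)
{
    return out_coord * scale;
}

inline float project_center(int out_coord, float scale)
{
    return (out_coord + 0.5f) * scale - 0.5f;
}
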
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
+ * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_bilinear_quantized_nchw(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float scale_x,
+ const float scale_y)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ const float2 r = (float2)(scale_x, scale_y);
+ const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r);
+ vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
+ * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
+ * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_bilinear_quantized_nhwc(
+ TENSOR3D_DECLARATION(in),
+ TENSOR3D_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float scale_x,
+ const float scale_y)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ const float new_x = get_global_id(1) * scale_x;
+ const float new_y = get_global_id(2) * scale_y;
+#elif SAMPLING_POLICY_CENTER
+ const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
+ const float new_y = (get_global_id(2) + 0.5f) * scale_y - 0.5f;
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+
+ const float new_xf = floor(new_x);
+ const float new_yf = floor(new_y);
+ float clamped_x = clamp(new_xf, 0.0f, input_width - 1);
+ float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
+ float clamped_x_ = clamped_x;
+ float clamped_x1_ = clamped_x1;
+ const float clamped_y = clamp(new_yf, 0.0f, input_height - 1);
+ const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
+
+#ifndef BORDER_MODE_REPLICATE
+ clamped_x1 = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1);
+ clamped_x_ = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
+ clamped_x = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
+ clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
+#endif /* BORDER_MODE_REPLICATE */
+
+ int4 ins = (int4)(*((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))),
+ *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y))),
+ *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1))),
+ *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1))));
+
+ const float a = new_x - new_xf;
+ const float b = 1.f - a;
+ const float a1 = new_y - new_yf;
+ const float b1 = 1.f - a1;
+ const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
+
+ const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
+
+ uchar res = convert_uchar_sat(convert_int_sat_rtp(fr / SCALE) + OFFSET);
+
+ *((__global DATA_TYPE *)out.ptr) = res;
+}
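
For QASYMM8 inputs the four neighbours are dequantized with (value - OFFSET) * SCALE, blended with the usual bilinear weights, and re-quantized by dividing by SCALE and adding OFFSET back. A host-side sketch of that round trip (C++; std::ceil stands in for the round-to-positive-infinity of convert_int_sat_rtp, and the helper is illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Dequantize -> bilinear blend -> requantize, mirroring the tail of
// scale_bilinear_quantized_nhwc. a/b are the fractional/complementary x
// weights, a1/b1 the y weights; p00..p11 are the four neighbours.
inline std::uint8_t blend_qasymm8(std::uint8_t p00, std::uint8_t p10,
                                  std::uint8_t p01, std::uint8_t p11,
                                  float a, float a1, float scale, int offset)
{
    const float b  = 1.f - a;
    const float b1 = 1.f - a1;

    const float f00 = (p00 - offset) * scale;
    const float f10 = (p10 - offset) * scale;
    const float f01 = (p01 - offset) * scale;
    const float f11 = (p11 - offset) * scale;

    const float fr = f00 * b * b1 + f10 * a * b1 + f01 * b * a1 + f11 * a * a1;

    const int q = static_cast<int>(std::ceil(fr / scale)) + offset; // rtp rounding
    return static_cast<std::uint8_t>(std::min(std::max(q, 0), 255)); // saturate to U8
}
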
diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/slice_ops.cl
new file mode 100644
index 0000000..bc3df47
--- /dev/null
+++ b/src/core/CL/cl_kernels/slice_ops.cl
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform a strided slice operation on a given input.
+ *
+ * @attention Supported tensor rank: up to 4
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input and output tensor depths should be given as preprocessor arguments using -DSRC_DEPTH=size and -DDST_DEPTH=size
+ * @attention Absolute start coordinates for each dimension should be given as preprocessor arguments -DSTART_index=value e.g. -DSTART_0=2
+ * @attention Strides for each dimension should be given as preprocessor arguments -DSTRIDE_index=value e.g. -DSTRIDE_1=1
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void strided_slice(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+
+ int offset = 0;
+
+ // Offset X
+#if defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ offset = (int)START_0 + min(xi, (int)LAST_ACCESSED_X);
+ input.ptr += offset * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+#elif defined(START_0) && defined(STRIDE_0)
+ offset = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0;
+ input.ptr += offset * input_stride_x;
+#endif // defined(START_0) && defined(STRIDE_0)
+
+ // Offset Y
+#if defined(START_1) && defined(STRIDE_1)
+ offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1;
+ input.ptr += offset * input_stride_y;
+#endif // defined(START_1) && defined(STRIDE_1)
+
+ // Offset Z
+#if defined(START_2) && defined(STRIDE_2)
+ offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2;
+ input.ptr += offset * input_stride_z;
+#endif // defined(START_2) && defined(STRIDE_2)
+
+ // Offset depth
+#if defined(START_3) && defined(STRIDE_3)
+ offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3;
+ input.ptr += offset * input_stride_w;
+#endif // defined(START_3) && defined(STRIDE_3)
+
+ // Store result
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input.ptr));
+
+ VSTORE(VEC_SIZE)
+ (val, 0, (__global DATA_TYPE *)(output.ptr));
+#else // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
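
For reference, the per-dimension arithmetic accumulated above maps an output coordinate out[d] to the input coordinate START_d + out[d] * STRIDE_d; when VEC_SIZE and LAST_ACCESSED_X are defined, the X access is additionally clamped to LAST_ACCESSED_X and the output pointer is shifted back by the same overshoot so the final vector store stays in bounds. A minimal host-side C sketch of the byte-offset computation (illustrative names, not library API):

#include <stddef.h>

/* Byte offset of the input element that feeds the output element at out_coord[0..3],
 * mirroring the per-dimension offsets accumulated by the kernel. */
static size_t strided_slice_src_offset(const int out_coord[4],     /* x, y, z, w of the output element  */
                                       const int start[4],         /* -DSTART_d compile-time constants  */
                                       const int stride[4],        /* -DSTRIDE_d compile-time constants */
                                       const size_t src_stride[4]) /* input byte strides per dimension  */
{
    size_t offset_bytes = 0;
    for(int d = 0; d < 4; ++d)
    {
        const int in_coord = start[d] + out_coord[d] * stride[d];
        offset_bytes += (size_t)in_coord * src_stride[d];
    }
    return offset_bytes;
}
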
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 4ad8180..e549b44 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -64,6 +64,7 @@
#endif /* VECTOR_SIZE END */
+// TODO (COMPMID-661): Remove if the non-fused kernels are removed
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
__constant uint4 idx4 = (uint4)(0, 1, 2, 3);
@@ -344,6 +345,7 @@
}
#ifdef NON_MULTIPLE_OF_GRID_SIZE
// How many work-items needed to complete the computation.
+ //TODO: Optimize this calculation (avoid %).
int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
if(lid < boundary_workitems)
{
@@ -459,6 +461,7 @@
sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
}
#ifdef NON_MULTIPLE_OF_GRID_SIZE
+ //TODO: Optimize the calculation (avoid %).
boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
if(lid < boundary_workitems)
{
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
index fcd1ec5..95d6d4b 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -301,6 +301,7 @@
}
#ifdef NON_MULTIPLE_OF_GRID_SIZE
// How many work-items needed to complete the computation.
+ //TODO: Optimize this calculation (avoid %).
int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
if(lid < boundary_workitems)
{
@@ -410,6 +411,7 @@
sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
}
#ifdef NON_MULTIPLE_OF_GRID_SIZE
+ //TODO: Optimize the calculation (avoid %).
boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
if(lid < boundary_workitems)
{
diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/space_to_batch.cl
new file mode 100644
index 0000000..d42a79d
--- /dev/null
+++ b/src/core/CL/cl_kernels/space_to_batch.cl
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
+ * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes)
+ * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes)
+ * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nchw(
+ TENSOR4D_DECLARATION(input),
+ IMAGE_DECLARATION(paddings),
+ VECTOR_DECLARATION(block_shape),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
+ const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+ const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
+ const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+ int block_x = *((__global int *)vector_offset(&block, 0));
+ int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int out_x = get_global_id(0);
+ const int out_y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ if((out_x >= pad_left_x && out_x < WIDTH_OUT - pad_right_x) && (out_y >= pad_left_y && out_y < HEIGHT_OUT - pad_right_y))
+ {
+ const int r = (BATCH_SIZE / (block_x * block_y));
+ const int w = batch_id % r;
+ const int in_x = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
+ const int in_y = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+ }
+}
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
+ * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes)
+ * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes)
+ * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nhwc(
+ TENSOR4D_DECLARATION(input),
+ IMAGE_DECLARATION(paddings),
+ VECTOR_DECLARATION(block_shape),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
+ const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+ const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
+ const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+ int block_x = *((__global int *)vector_offset(&block, 0));
+ int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int out_x = get_global_id(1);
+ const int out_y = get_global_id(2);
+ const int z = get_global_id(0);
+
+ if((out_x >= pad_left_x && out_x < WIDTH_OUT - pad_right_x) && (out_y >= pad_left_y && out_y < HEIGHT_OUT - pad_right_y))
+ {
+ const int r = (BATCH_SIZE / (block_x * block_y));
+ const int w = batch_id % r;
+ const int in_x = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
+ const int in_y = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+ }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE)
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nchw(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int block_x = BLOCK_SHAPE_X;
+ int block_y = BLOCK_SHAPE_Y;
+
+ const int out_x = get_global_id(0);
+ const int out_y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ if((out_x >= PAD_LEFT_X && out_x < WIDTH_OUT - PAD_RIGHT_X) && (out_y >= PAD_LEFT_Y && out_y < HEIGHT_OUT - PAD_RIGHT_Y))
+ {
+ const int r = (BATCH_SIZE / (block_x * block_y));
+ const int w = batch_id % r;
+ const int in_x = (out_x - PAD_LEFT_X) * block_x + (batch_id / r) % block_x;
+ const int in_y = (out_y - PAD_LEFT_Y) * block_y + (batch_id / r) / block_x;
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+ }
+}
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int block_x = BLOCK_SHAPE_X;
+ int block_y = BLOCK_SHAPE_Y;
+
+ const int out_x = get_global_id(1);
+ const int out_y = get_global_id(2);
+ const int z = get_global_id(0);
+
+ if((out_x >= PAD_LEFT_X && out_x < WIDTH_OUT - PAD_RIGHT_X) && (out_y >= PAD_LEFT_Y && out_y < HEIGHT_OUT - PAD_RIGHT_Y))
+ {
+ const int r = (BATCH_SIZE / (block_x * block_y));
+ const int w = batch_id % r;
+ const int in_x = (out_x - PAD_LEFT_X) * block_x + (batch_id / r) % block_x;
+ const int in_y = (out_y - PAD_LEFT_Y) * block_y + (batch_id / r) / block_x;
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+ }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)
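
All four kernels above share the same coordinate mapping: an interior output element of batch batch_id reads the input element derived from r = BATCH_SIZE / (block_x * block_y). A host-side C sketch that mirrors this arithmetic (illustrative only; names and the struct are assumptions):

/* Input coordinate that feeds output element (out_x, out_y) of output batch batch_id.
 * inside == 0 means the element lies in the padded region and is not written. */
typedef struct { int in_x, in_y, in_batch, inside; } S2BCoord;

static S2BCoord space_to_batch_src_coord(int out_x, int out_y, int batch_id,
                                         int batch_size,           /* -DBATCH_SIZE   */
                                         int block_x, int block_y, /* block shape    */
                                         int pad_left_x, int pad_right_x,
                                         int pad_left_y, int pad_right_y,
                                         int width_out, int height_out)
{
    S2BCoord c = { 0, 0, 0, 0 };
    if(out_x < pad_left_x || out_x >= width_out - pad_right_x ||
       out_y < pad_left_y || out_y >= height_out - pad_right_y)
    {
        return c; /* padded border element */
    }
    const int r = batch_size / (block_x * block_y);
    c.in_batch  = batch_id % r;
    c.in_x      = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
    c.in_y      = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
    c.inside    = 1;
    return c;
}
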
diff --git a/src/core/CL/cl_kernels/upsample_layer.cl b/src/core/CL/cl_kernels/upsample_layer.cl
new file mode 100644
index 0000000..65912f5
--- /dev/null
+++ b/src/core/CL/cl_kernels/upsample_layer.cl
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsample on an input image. (NCHW)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this might need to step back a bit)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
+ const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+ src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+ dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
+
+ vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
+ vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
+
+/** This function applies upsample on an input image. (NHWC)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this might need to step back a bit)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
+ const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+ src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+ dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)src.ptr);
+
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0));
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1));
+ vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1));
+#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
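
Both kernels implement 2x nearest-neighbour upsampling, so each source element ends up replicated into a 2x2 block of the destination. A conceptual C sketch of that overall effect, assuming plain NCHW float buffers (illustrative only, not the kernel's work-item mapping):

/* Conceptual reference for 2x nearest-neighbour upsampling in NCHW layout. */
static void upsample2x2_nchw_ref(const float *src, float *dst,
                                 int channels, int src_w, int src_h)
{
    const int dst_w = 2 * src_w;
    const int dst_h = 2 * src_h;
    for(int c = 0; c < channels; ++c)
    {
        for(int y = 0; y < dst_h; ++y)
        {
            for(int x = 0; x < dst_w; ++x)
            {
                /* dst(x, y) takes the value of src(x / 2, y / 2) */
                dst[(c * dst_h + y) * dst_w + x] = src[(c * src_h + y / 2) * src_w + x / 2];
            }
        }
    }
}
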
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 86a5e06..9afec7d 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -38,6 +38,7 @@
return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
}
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
/** Clamps the given coordinates to the borders.
*
* @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
@@ -125,6 +126,7 @@
return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
}
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
*
* @param[in] in Pointer to the source image.
diff --git a/src/core/CL/cl_kernels/warp_helpers_quantized.h b/src/core/CL/cl_kernels/warp_helpers_quantized.h
new file mode 100644
index 0000000..48d6fae
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_helpers_quantized.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size)
+{
+ const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+ const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+ return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height)
+{
+ return clamp_to_border_with_size_quantized(coords, width, height, 1);
+}
+
+/** Given a texel coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks are done here to see whether the coordinates are out of the image.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords_quantized(const float2 coord)
+{
+ return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords_quantized()
+{
+ return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+ * @param[in] border_size Border size
+ * @param[in] scale Scale value
+ * @param[in] offset_qasymm Offset value
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size,
+ const float scale, const int offset_qasymm)
+{
+ // If any of the 4 texels is out of the image's boundaries, the border value (REPLICATE or CONSTANT) is used for that texel.
+
+ // Sets the 4x4 coordinates for each of the four input texels
+ const float8 fc = floor(coords);
+ const float16 c1 = (float16)(
+ clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size),
+ clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size));
+ const float16 c2 = (float16)(
+ clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size),
+ clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size));
+
+ // Loads the values from the input image
+ const int16 t = (int16)(
+ /* tl, tr, bl, br */
+ * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+ *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+ *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+ *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+ *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+ *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+ *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+ *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+
+ const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale;
+
+ const float8 a = coords - fc;
+ const float8 b = ((float8)(1.f)) - a;
+ const float4 fr = (float4)(
+ ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)),
+ ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)),
+ ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)),
+ ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7)));
+
+ const uchar4 res = convert_uchar4_sat(convert_int4_sat_rtp(fr / scale) + offset_qasymm);
+
+ return res;
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+ * @param[in] scale Scale value
+ * @param[in] offset_qasymm Offset value
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm)
+{
+ return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1, scale, offset_qasymm);
+}
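
The helpers above split the work into two steps per sampling point: gather the 2x2 neighbourhood of the floored coordinate and clamp every coordinate to [-border_size, dim - 1 + border_size], then dequantize, blend and requantize. A scalar C sketch of the first step (illustrative names, not library API):

#include <math.h>

static float clampf(float v, float lo, float hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

/* Writes the clamped (x, y) pairs of [top-left, top-right, bottom-left, bottom-right]
 * into out_xy[8] (even positions are x, odd positions are y), mirroring
 * get_neighbour_coords_quantized followed by clamp_to_border_with_size_quantized. */
static void neighbour_coords_clamped(float x, float y, float width, float height,
                                     float border_size, float out_xy[8])
{
    const float fx = floorf(x);
    const float fy = floorf(y);
    const float xs[4] = { fx, fx + 1.f, fx, fx + 1.f };
    const float ys[4] = { fy, fy, fy + 1.f, fy + 1.f };
    for(int i = 0; i < 4; ++i)
    {
        out_xy[2 * i + 0] = clampf(xs[i], 0.f - border_size, width - 1.f + border_size);
        out_xy[2 * i + 1] = clampf(ys[i], 0.f - border_size, height - 1.f + border_size);
    }
}
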
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/winograd_filter_transform.cl
index 73da005..3b9b1e9 100644
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_filter_transform.cl
@@ -30,8 +30,9 @@
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -60,45 +61,54 @@
// Load the values from the input tensor
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- float3 w0 = vload3(0, (__global float *)(src_addr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- float3 w0 = (float3)(*((__global float *)(src_addr + 0 * src_stride_y)),
- *((__global float *)(src_addr + 1 * src_stride_y)),
- *((__global float *)(src_addr + 2 * src_stride_y)));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
- float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
- float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
// Row 0
- float4 out0 = 0.0f;
- out0.s0 = (w0.s0);
- out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
- out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
- out0.s3 = (w0.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0 = 0.0f;
+ out0.s0 = (w0.s0);
+ out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+ out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+ out0.s3 = (w0.s2);
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
// Row 1
- float4 out1 = 0.0f;
- out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
- out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
- out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
- out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out1 = 0.0f;
+ out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+ out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
// Row 2
- float4 out2 = 0.0f;
- out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
- out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
- out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
- out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out2 = 0.0f;
+ out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+ out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+ out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+ out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
// Row 3
- float4 out3 = 0.0f;
- out3.s0 = (w2.s0);
- out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
- out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
- out3.s3 = (w2.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out3 = 0.0f;
+ out3.s0 = (w2.s0);
+ out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+ out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+ out3.s3 = (w2.s2);
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
int z = get_global_id(2);
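
For the full 3x3 path (neither -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL nor -DWINOGRAD_FILTER_TRANSFORM_VERTICAL defined), out0..out3 above are the rows of G * W * G^T for Winograd F(2x2, 3x3). A C cross-check sketch, assuming the standard filter-transform matrix G shown below (reference only, not library code):

static void winograd_filter_transform_2x2_3x3(const float w[3][3], float out[4][4])
{
    static const float G[4][3] = {
        { 1.0f, 0.0f, 0.0f },
        { 0.5f, 0.5f, 0.5f },
        { 0.5f, -0.5f, 0.5f },
        { 0.0f, 0.0f, 1.0f }
    };
    float gw[4][3]; /* G * W */
    for(int i = 0; i < 4; ++i)
    {
        for(int j = 0; j < 3; ++j)
        {
            gw[i][j] = G[i][0] * w[0][j] + G[i][1] * w[1][j] + G[i][2] * w[2][j];
        }
    }
    for(int i = 0; i < 4; ++i) /* (G * W) * G^T */
    {
        for(int j = 0; j < 4; ++j)
        {
            out[i][j] = gw[i][0] * G[j][0] + gw[i][1] * G[j][1] + gw[i][2] * G[j][2];
        }
    }
}
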
@@ -111,24 +121,24 @@
// Store the values across the channels
// 16 channels for 3x3 kernels
// 4 channels for 3x1 or 1x3 kernels
- *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global float *)(dst_addr + 4 * dst_stride_z) = out1.s0;
- *(__global float *)(dst_addr + 5 * dst_stride_z) = out1.s1;
- *(__global float *)(dst_addr + 6 * dst_stride_z) = out1.s2;
- *(__global float *)(dst_addr + 7 * dst_stride_z) = out1.s3;
- *(__global float *)(dst_addr + 8 * dst_stride_z) = out2.s0;
- *(__global float *)(dst_addr + 9 * dst_stride_z) = out2.s1;
- *(__global float *)(dst_addr + 10 * dst_stride_z) = out2.s2;
- *(__global float *)(dst_addr + 11 * dst_stride_z) = out2.s3;
- *(__global float *)(dst_addr + 12 * dst_stride_z) = out3.s0;
- *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1;
- *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2;
- *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
@@ -137,8 +147,9 @@
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -167,71 +178,82 @@
// Load the values from the input tensor
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- float3 w0 = vload3(0, (__global float *)(src_addr));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- float3 w0 = (float3)(*((__global float *)(src_addr + 0 * src_stride_y)),
- *((__global float *)(src_addr + 1 * src_stride_y)),
- *((__global float *)(src_addr + 2 * src_stride_y)));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
- float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
- float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
// Row 0
- float8 out0 = 0.0f;
- out0.s0 = (w0.s0) / 16.f;
- out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
- out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
- out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
- out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
- out0.s5 = (w0.s2) / 4.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = (w0.s0) / 16.f;
+ out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+ out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+ out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s5 = (w0.s2) / 4.f;
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
// Row 1
- float8 out1 = 0.0f;
- out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
- out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
- out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
- out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
- out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
- out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
// Row 2
- float8 out2 = 0.0f;
- out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
- out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
- out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
- out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
- out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
- out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+ out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
// Row 3
- float8 out3 = 0.0f;
- out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
- out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
// Row 4
- float8 out4 = 0.0f;
- out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
- out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
// Row 5
- float8 out5 = 0.0f;
- out5.s0 = (w2.s0) / 4.f;
- out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
- out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
- out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
- out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
- out5.s5 = (w2.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (w2.s0) / 4.f;
+ out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+ out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+ out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s5 = (w2.s2);
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
int z = get_global_id(2);
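
Likewise, for the full 3x3 path of this larger variant, out0..out5 are the rows of G * W * G^T for Winograd F(4x4, 3x3); the 6x3 G matrix below reproduces the unrolled coefficients (1/16, 1/24, 1/96, 1/144, 1/576, ...). A C cross-check sketch (reference only, not library code):

static void winograd_filter_transform_4x4_3x3(const float w[3][3], float out[6][6])
{
    static const float G[6][3] = {
        { 1.f / 4.f, 0.f, 0.f },
        { -1.f / 6.f, -1.f / 6.f, -1.f / 6.f },
        { -1.f / 6.f, 1.f / 6.f, -1.f / 6.f },
        { 1.f / 24.f, 1.f / 12.f, 1.f / 6.f },
        { 1.f / 24.f, -1.f / 12.f, 1.f / 6.f },
        { 0.f, 0.f, 1.f }
    };
    float gw[6][3]; /* G * W */
    for(int i = 0; i < 6; ++i)
    {
        for(int j = 0; j < 3; ++j)
        {
            gw[i][j] = G[i][0] * w[0][j] + G[i][1] * w[1][j] + G[i][2] * w[2][j];
        }
    }
    for(int i = 0; i < 6; ++i) /* (G * W) * G^T */
    {
        for(int j = 0; j < 6; ++j)
        {
            out[i][j] = gw[i][0] * G[j][0] + gw[i][1] * G[j][1] + gw[i][2] * G[j][2];
        }
    }
}
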
@@ -244,44 +266,44 @@
// Store the values across the channels
// 36 channels for 3x3 kernels
// 6 channels for 3x1 or 1x3 kernels
- *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global float *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global float *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global float *)(dst_addr + 6 * dst_stride_z) = out1.s0;
- *(__global float *)(dst_addr + 7 * dst_stride_z) = out1.s1;
- *(__global float *)(dst_addr + 8 * dst_stride_z) = out1.s2;
- *(__global float *)(dst_addr + 9 * dst_stride_z) = out1.s3;
- *(__global float *)(dst_addr + 10 * dst_stride_z) = out1.s4;
- *(__global float *)(dst_addr + 11 * dst_stride_z) = out1.s5;
- *(__global float *)(dst_addr + 12 * dst_stride_z) = out2.s0;
- *(__global float *)(dst_addr + 13 * dst_stride_z) = out2.s1;
- *(__global float *)(dst_addr + 14 * dst_stride_z) = out2.s2;
- *(__global float *)(dst_addr + 15 * dst_stride_z) = out2.s3;
- *(__global float *)(dst_addr + 16 * dst_stride_z) = out2.s4;
- *(__global float *)(dst_addr + 17 * dst_stride_z) = out2.s5;
- *(__global float *)(dst_addr + 18 * dst_stride_z) = out3.s0;
- *(__global float *)(dst_addr + 19 * dst_stride_z) = out3.s1;
- *(__global float *)(dst_addr + 20 * dst_stride_z) = out3.s2;
- *(__global float *)(dst_addr + 21 * dst_stride_z) = out3.s3;
- *(__global float *)(dst_addr + 22 * dst_stride_z) = out3.s4;
- *(__global float *)(dst_addr + 23 * dst_stride_z) = out3.s5;
- *(__global float *)(dst_addr + 24 * dst_stride_z) = out4.s0;
- *(__global float *)(dst_addr + 25 * dst_stride_z) = out4.s1;
- *(__global float *)(dst_addr + 26 * dst_stride_z) = out4.s2;
- *(__global float *)(dst_addr + 27 * dst_stride_z) = out4.s3;
- *(__global float *)(dst_addr + 28 * dst_stride_z) = out4.s4;
- *(__global float *)(dst_addr + 29 * dst_stride_z) = out4.s5;
- *(__global float *)(dst_addr + 30 * dst_stride_z) = out5.s0;
- *(__global float *)(dst_addr + 31 * dst_stride_z) = out5.s1;
- *(__global float *)(dst_addr + 32 * dst_stride_z) = out5.s2;
- *(__global float *)(dst_addr + 33 * dst_stride_z) = out5.s3;
- *(__global float *)(dst_addr + 34 * dst_stride_z) = out5.s4;
- *(__global float *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
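A minimal sketch of what the float8 -> VEC_DATA_TYPE(DATA_TYPE, 8) rewrite above resolves to, assuming the usual token-pasting definition of VEC_DATA_TYPE from helpers.h; the demo kernel name and the chosen -DDATA_TYPE value are illustrative and not part of the patch:

    // Assumed helpers.h definition: VEC_DATA_TYPE(float, 8) token-pastes to float8
    #define VEC_DATA_TYPE_STR(type, size) type##size
    #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

    // Build with e.g. -DDATA_TYPE=float (or -DDATA_TYPE=half where cl_khr_fp16 is available)
    __kernel void vec_data_type_demo(__global DATA_TYPE *dst)
    {
        // Declaration split over two lines, exactly as in the transform rows above
        VEC_DATA_TYPE(DATA_TYPE, 8)
        out = 0.0f;                        // resolves to "float8 out = 0.0f;" for -DDATA_TYPE=float
        out.s0 = (DATA_TYPE)(1.0f / 96.f); // float literals convert on assignment, as in the rows above
        vstore8(out, 0, dst);              // standard OpenCL vector store
    }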
@@ -290,8 +312,9 @@
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
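The build options described in the @note lines above select both the element type and the transform variant; a minimal sketch of how they interact inside the kernel (the concrete values are illustrative only):

    //   -DDATA_TYPE=half  -DSRC_DIM_Z=64                                        -> full 3x3 transform, 36 channels stored
    //   -DDATA_TYPE=float -DSRC_DIM_Z=64 -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL  -> 3x1 transform, 6 channels stored
    #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // 1D filter (3x1 or 1x3): only row 0 (out00..out05) is computed and 6 channels are stored
    #else
    // 2D 3x3 filter: all rows (out00..out55) are computed and 36 channels are stored
    #endif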
@@ -320,25 +343,25 @@
// Load the values from the input tensor
#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- float w00 = *((__global float *)(src_addr + 0 * src_stride_z));
- float w01 = *((__global float *)(src_addr + 1 * src_stride_z));
- float w02 = *((__global float *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- float w00 = *((__global float *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
- float w01 = *((__global float *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
- float w02 = *((__global float *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- float w10 = *((__global float *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
- float w11 = *((__global float *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
- float w12 = *((__global float *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
- float w20 = *((__global float *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
- float w21 = *((__global float *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
- float w22 = *((__global float *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
// Row 0
- float out00, out01, out02, out03, out04, out05;
+ DATA_TYPE out00, out01, out02, out03, out04, out05;
out00 = (w00) / 16.f;
out01 = (-w00 - w01 - w02) / 24.f;
out02 = (-w00 + w01 - w02) / 24.f;
@@ -348,7 +371,7 @@
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
// Row 1
- float out10, out11, out12, out13, out14, out15;
+ DATA_TYPE out10, out11, out12, out13, out14, out15;
out10 = (-w00 - w10 - w20) / 24.f;
out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f;
out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f;
@@ -357,7 +380,7 @@
out15 = (-w02 - w12 - w22) / 6.f;
// Row 2
- float out20, out21, out22, out23, out24, out25;
+ DATA_TYPE out20, out21, out22, out23, out24, out25;
out20 = (-w00 + w10 - w20) / 24.f;
out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f;
out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f;
@@ -366,7 +389,7 @@
out25 = (-w02 + w12 - w22) / 6.f;
// Row 3
- float out30, out31, out32, out33, out34, out35;
+ DATA_TYPE out30, out31, out32, out33, out34, out35;
out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f;
out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
@@ -375,7 +398,7 @@
out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f;
// Row 4
- float out40, out41, out42, out43, out44, out45;
+ DATA_TYPE out40, out41, out42, out43, out44, out45;
out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f;
out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
@@ -384,7 +407,7 @@
out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f;
// Row 5
- float out50, out51, out52, out53, out54, out55;
+ DATA_TYPE out50, out51, out52, out53, out54, out55;
out50 = (w20) / 4.f;
out51 = (-w20 - w21 - w22) / 6.f;
out52 = (-w20 + w21 - w22) / 6.f;
@@ -397,48 +420,48 @@
int y0 = get_global_id(0); // idx channel
// Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(float) + y0 * dst_stride_y;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
// Store the values across the channels
// 36 channels for 3x3 kernels
// 6 channels for 3x1 or 1x3 kernels
- *(__global float *)(dst_addr + 0 * dst_stride_z) = out00;
- *(__global float *)(dst_addr + 1 * dst_stride_z) = out01;
- *(__global float *)(dst_addr + 2 * dst_stride_z) = out02;
- *(__global float *)(dst_addr + 3 * dst_stride_z) = out03;
- *(__global float *)(dst_addr + 4 * dst_stride_z) = out04;
- *(__global float *)(dst_addr + 5 * dst_stride_z) = out05;
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05;
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global float *)(dst_addr + 6 * dst_stride_z) = out10;
- *(__global float *)(dst_addr + 7 * dst_stride_z) = out11;
- *(__global float *)(dst_addr + 8 * dst_stride_z) = out12;
- *(__global float *)(dst_addr + 9 * dst_stride_z) = out13;
- *(__global float *)(dst_addr + 10 * dst_stride_z) = out14;
- *(__global float *)(dst_addr + 11 * dst_stride_z) = out15;
- *(__global float *)(dst_addr + 12 * dst_stride_z) = out20;
- *(__global float *)(dst_addr + 13 * dst_stride_z) = out21;
- *(__global float *)(dst_addr + 14 * dst_stride_z) = out22;
- *(__global float *)(dst_addr + 15 * dst_stride_z) = out23;
- *(__global float *)(dst_addr + 16 * dst_stride_z) = out24;
- *(__global float *)(dst_addr + 17 * dst_stride_z) = out25;
- *(__global float *)(dst_addr + 18 * dst_stride_z) = out30;
- *(__global float *)(dst_addr + 19 * dst_stride_z) = out31;
- *(__global float *)(dst_addr + 20 * dst_stride_z) = out32;
- *(__global float *)(dst_addr + 21 * dst_stride_z) = out33;
- *(__global float *)(dst_addr + 22 * dst_stride_z) = out34;
- *(__global float *)(dst_addr + 23 * dst_stride_z) = out35;
- *(__global float *)(dst_addr + 24 * dst_stride_z) = out40;
- *(__global float *)(dst_addr + 25 * dst_stride_z) = out41;
- *(__global float *)(dst_addr + 26 * dst_stride_z) = out42;
- *(__global float *)(dst_addr + 27 * dst_stride_z) = out43;
- *(__global float *)(dst_addr + 28 * dst_stride_z) = out44;
- *(__global float *)(dst_addr + 29 * dst_stride_z) = out45;
- *(__global float *)(dst_addr + 30 * dst_stride_z) = out50;
- *(__global float *)(dst_addr + 31 * dst_stride_z) = out51;
- *(__global float *)(dst_addr + 32 * dst_stride_z) = out52;
- *(__global float *)(dst_addr + 33 * dst_stride_z) = out53;
- *(__global float *)(dst_addr + 34 * dst_stride_z) = out54;
- *(__global float *)(dst_addr + 35 * dst_stride_z) = out55;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out10;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out11;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out12;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out13;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
@@ -448,8 +471,9 @@
*
* @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -478,177 +502,192 @@
// Load the values from the input tensor
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- float4 w00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- float w01 = *((__global float *)(src_addr + 0 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- float4 w00 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)),
- *((__global float *)(src_addr + 1 * src_stride_y)),
- *((__global float *)(src_addr + 2 * src_stride_y)),
- *((__global float *)(src_addr + 3 * src_stride_y)));
- float w01 = *((__global float *)(src_addr + 4 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- float4 w00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- float w01 = *((__global float *)(src_addr + 0 * src_stride_y) + 4);
- float4 w10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- float w11 = *((__global float *)(src_addr + 1 * src_stride_y) + 4);
- float4 w20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- float w21 = *((__global float *)(src_addr + 2 * src_stride_y) + 4);
- float4 w30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
- float w31 = *((__global float *)(src_addr + 3 * src_stride_y) + 4);
- float4 w40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
- float w41 = *((__global float *)(src_addr + 4 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
// Transform the input tile
// Row 0
- float8 out0 = 0.0f;
- out0.s0 = w00.s0;
- out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
- out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
- out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
- out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
- out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
- out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
- out0.s7 = w01;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = w00.s0;
+ out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+ out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+ out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+ out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+ out0.s7 = w01;
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
// Row 1
- float8 out1 = 0.0f;
- out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
- out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
- out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
- out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
- out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
- out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
- out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
- out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+ out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
// Row 2
- float8 out2 = 0.0f;
- out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
- out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
- out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
- out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
- out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
- out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
- out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
- out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+ out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
// Row 3
- float8 out3 = 0.0f;
- out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
- out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
- out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
- out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
- out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
- out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
- out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
- out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
// Row 4
- float8 out4 = 0.0f;
- out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
- out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
- out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
- out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
- out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
- out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
- out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
- out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
// Row 5
- float8 out5 = 0.0f;
- out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
- out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
- out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
- out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
- out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
- out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
- out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
- out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
+ out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
// Row 6
- float8 out6 = 0.0f;
- out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
- out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
- out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
- out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
- out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
- out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
- out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
- out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+ out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
+ out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
// Row 7
- float8 out7 = 0.0f;
- out7.s0 = w40.s0;
- out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
- out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
- out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
- out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
- out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
- out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
- out7.s7 = w41;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+ out7.s0 = w40.s0;
+ out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
+ out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
+ out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
+ out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
+ out7.s7 = w41;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
int z = get_global_id(2);
@@ -656,75 +695,75 @@
int y0 = z % SRC_DIM_Z; // idx channel
// Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(float) + y0 * dst_stride_y;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
// Store the values across the channels
- *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global float *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global float *)(dst_addr + 5 * dst_stride_z) = out0.s5;
- *(__global float *)(dst_addr + 6 * dst_stride_z) = out0.s6;
- *(__global float *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global float *)(dst_addr + 8 * dst_stride_z) = out1.s0;
- *(__global float *)(dst_addr + 9 * dst_stride_z) = out1.s1;
- *(__global float *)(dst_addr + 10 * dst_stride_z) = out1.s2;
- *(__global float *)(dst_addr + 11 * dst_stride_z) = out1.s3;
- *(__global float *)(dst_addr + 12 * dst_stride_z) = out1.s4;
- *(__global float *)(dst_addr + 13 * dst_stride_z) = out1.s5;
- *(__global float *)(dst_addr + 14 * dst_stride_z) = out1.s6;
- *(__global float *)(dst_addr + 15 * dst_stride_z) = out1.s7;
- *(__global float *)(dst_addr + 16 * dst_stride_z) = out2.s0;
- *(__global float *)(dst_addr + 17 * dst_stride_z) = out2.s1;
- *(__global float *)(dst_addr + 18 * dst_stride_z) = out2.s2;
- *(__global float *)(dst_addr + 19 * dst_stride_z) = out2.s3;
- *(__global float *)(dst_addr + 20 * dst_stride_z) = out2.s4;
- *(__global float *)(dst_addr + 21 * dst_stride_z) = out2.s5;
- *(__global float *)(dst_addr + 22 * dst_stride_z) = out2.s6;
- *(__global float *)(dst_addr + 23 * dst_stride_z) = out2.s7;
- *(__global float *)(dst_addr + 24 * dst_stride_z) = out3.s0;
- *(__global float *)(dst_addr + 25 * dst_stride_z) = out3.s1;
- *(__global float *)(dst_addr + 26 * dst_stride_z) = out3.s2;
- *(__global float *)(dst_addr + 27 * dst_stride_z) = out3.s3;
- *(__global float *)(dst_addr + 28 * dst_stride_z) = out3.s4;
- *(__global float *)(dst_addr + 29 * dst_stride_z) = out3.s5;
- *(__global float *)(dst_addr + 30 * dst_stride_z) = out3.s6;
- *(__global float *)(dst_addr + 31 * dst_stride_z) = out3.s7;
- *(__global float *)(dst_addr + 32 * dst_stride_z) = out4.s0;
- *(__global float *)(dst_addr + 33 * dst_stride_z) = out4.s1;
- *(__global float *)(dst_addr + 34 * dst_stride_z) = out4.s2;
- *(__global float *)(dst_addr + 35 * dst_stride_z) = out4.s3;
- *(__global float *)(dst_addr + 36 * dst_stride_z) = out4.s4;
- *(__global float *)(dst_addr + 37 * dst_stride_z) = out4.s5;
- *(__global float *)(dst_addr + 38 * dst_stride_z) = out4.s6;
- *(__global float *)(dst_addr + 39 * dst_stride_z) = out4.s7;
- *(__global float *)(dst_addr + 40 * dst_stride_z) = out5.s0;
- *(__global float *)(dst_addr + 41 * dst_stride_z) = out5.s1;
- *(__global float *)(dst_addr + 42 * dst_stride_z) = out5.s2;
- *(__global float *)(dst_addr + 43 * dst_stride_z) = out5.s3;
- *(__global float *)(dst_addr + 44 * dst_stride_z) = out5.s4;
- *(__global float *)(dst_addr + 45 * dst_stride_z) = out5.s5;
- *(__global float *)(dst_addr + 46 * dst_stride_z) = out5.s6;
- *(__global float *)(dst_addr + 47 * dst_stride_z) = out5.s7;
- *(__global float *)(dst_addr + 48 * dst_stride_z) = out6.s0;
- *(__global float *)(dst_addr + 49 * dst_stride_z) = out6.s1;
- *(__global float *)(dst_addr + 50 * dst_stride_z) = out6.s2;
- *(__global float *)(dst_addr + 51 * dst_stride_z) = out6.s3;
- *(__global float *)(dst_addr + 52 * dst_stride_z) = out6.s4;
- *(__global float *)(dst_addr + 53 * dst_stride_z) = out6.s5;
- *(__global float *)(dst_addr + 54 * dst_stride_z) = out6.s6;
- *(__global float *)(dst_addr + 55 * dst_stride_z) = out6.s7;
- *(__global float *)(dst_addr + 56 * dst_stride_z) = out7.s0;
- *(__global float *)(dst_addr + 57 * dst_stride_z) = out7.s1;
- *(__global float *)(dst_addr + 58 * dst_stride_z) = out7.s2;
- *(__global float *)(dst_addr + 59 * dst_stride_z) = out7.s3;
- *(__global float *)(dst_addr + 60 * dst_stride_z) = out7.s4;
- *(__global float *)(dst_addr + 61 * dst_stride_z) = out7.s5;
- *(__global float *)(dst_addr + 62 * dst_stride_z) = out7.s6;
- *(__global float *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
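The stores above are written purely in terms of the DATA_TYPE macro, so the host side has to inject that macro when it builds the program. Below is a minimal sketch of that plumbing, assuming the arm_compute CLBuildOptions helper and get_cl_type_from_data_type() (which map F32 to "float" and F16 to "half"); the function name, kernel name and SRC_DIM_Z handling are illustrative only, not the library's actual configure code.

// Sketch only: assembles the -DDATA_TYPE / -DSRC_DIM_Z build options the
// templated kernels expect. get_cl_type_from_data_type() yields "float" for
// F32 and "half" for F16; the kernel name below is an example.
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorInfo.h"
#include "support/ToolchainSupport.h"

cl::Kernel build_filter_transform_kernel(const arm_compute::ITensorInfo &weights)
{
    arm_compute::CLBuildOptions build_opts;
    // Select float/half at compile time based on the tensor's data type
    build_opts.add_option("-DDATA_TYPE=" + arm_compute::get_cl_type_from_data_type(weights.data_type()));
    // Channel dimension used by the kernel to split the input in batches (assumed index)
    build_opts.add_option("-DSRC_DIM_Z=" + arm_compute::support::cpp11::to_string(weights.dimension(2)));

    return static_cast<cl::Kernel>(arm_compute::CLKernelLibrary::get().create_kernel("winograd_filter_transform_4x4_5x5_nchw", build_opts.options()));
}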
@@ -733,8 +772,9 @@
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -759,242 +799,250 @@
{
Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
- const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(float) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+ const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
// Load the values from the input tensor
- float w00 = *((__global float *)(src_addr + 0 * src_stride_z));
- float w01 = *((__global float *)(src_addr + 1 * src_stride_z));
- float w02 = *((__global float *)(src_addr + 2 * src_stride_z));
- float w03 = *((__global float *)(src_addr + 3 * src_stride_z));
- float w04 = *((__global float *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
// Load the values from the input tensor
- float w00 = *((__global float *)(src_addr + 0 * src_stride_y));
- float w01 = *((__global float *)(src_addr + 1 * src_stride_y));
- float w02 = *((__global float *)(src_addr + 2 * src_stride_y));
- float w03 = *((__global float *)(src_addr + 3 * src_stride_y));
- float w04 = *((__global float *)(src_addr + 4 * src_stride_y));
+ DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- float w10 = *((__global float *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
- float w11 = *((__global float *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
- float w12 = *((__global float *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
- float w13 = *((__global float *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
- float w14 = *((__global float *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
- float w20 = *((__global float *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
- float w21 = *((__global float *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
- float w22 = *((__global float *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
- float w23 = *((__global float *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
- float w24 = *((__global float *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
- float w30 = *((__global float *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
- float w31 = *((__global float *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
- float w32 = *((__global float *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
- float w33 = *((__global float *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
- float w34 = *((__global float *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
- float w40 = *((__global float *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
- float w41 = *((__global float *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
- float w42 = *((__global float *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
- float w43 = *((__global float *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
- float w44 = *((__global float *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
+ DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
+ DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
+ DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
+ DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
// Row 0
- float8 out0 = 0.0f;
- out0.s0 = w00;
- out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f;
- out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f;
- out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f;
- out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f;
- out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f;
- out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f;
- out0.s7 = w04;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = w00;
+ out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f;
+ out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f;
+ out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f;
+ out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f;
+ out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f;
+ out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f;
+ out0.s7 = w04;
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
// Row 1
- float8 out1 = 0.0f;
- out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f;
- out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
- out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
- out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
- (w04 + w14 + w24 + w34 + w44)) / 405.f;
- out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
- (w04 + w14 + w24 + w34 + w44)) / 405.f;
- out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) +
- (w04 + w14 + w24 + w34 + w44)) / 810.f;
- out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) +
- (w04 + w14 + w24 + w34 + w44)) / 810.f;
- out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f;
+ out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
+ out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
+ out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
+ (w04 + w14 + w24 + w34 + w44)) / 405.f;
+ out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
+ (w04 + w14 + w24 + w34 + w44)) / 405.f;
+ out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) +
+ (w04 + w14 + w24 + w34 + w44)) / 810.f;
+ out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) +
+ (w04 + w14 + w24 + w34 + w44)) / 810.f;
+ out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f;
// Row 2
- float8 out2 = 0.0f;
- out2.s0 = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f;
- out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
- out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
- out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
- (w04 - w14 + w24 - w34 + w44)) / 405.f;
- out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
- (w04 - w14 + w24 - w34 + w44)) / 405.f;
- out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) +
- (w04 - w14 + w24 - w34 + w44)) / 810.f;
- out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) +
- (w04 - w14 + w24 - w34 + w44)) / 810.f;
- out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f;
+ out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
+ out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
+ out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
+ (w04 - w14 + w24 - w34 + w44)) / 405.f;
+ out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
+ (w04 - w14 + w24 - w34 + w44)) / 405.f;
+ out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) +
+ (w04 - w14 + w24 - w34 + w44)) / 810.f;
+ out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) +
+ (w04 - w14 + w24 - w34 + w44)) / 810.f;
+ out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f;
// Row 3
- float8 out3 = 0.0f;
- out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f;
- out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) +
- (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
- out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) -
- (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
- out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
- (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
- out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
- (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
- out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
- (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
- out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
- (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
- out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f;
+ out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) +
+ (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
+ out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) -
+ (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
+ out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f
+ * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
+ out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f
+ * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
+ out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
+ out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
+ out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f;
// Row 4
- float8 out4 = 0.0f;
- out4.s0 = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f;
- out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) +
- (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
- out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) -
- (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
- out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
- (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
- out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
- (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
- out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
- (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
- out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
- (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
- out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f;
+ out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) +
+ (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
+ out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) -
+ (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
+ out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f
+ * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
+ out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f
+ * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
+ out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
+ out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
+ (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
+ out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f;
// Row 5
- float8 out5 = 0.0f;
- out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f;
- out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) +
- (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
- out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) -
- (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
- out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
- (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
- out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
- (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
- out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
- (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
- out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
- (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
- out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f;
+ out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) +
+ (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
+ out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) -
+ (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
+ out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f
+ * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
+ out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f
+ * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
+ out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
+ (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
+ out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
+ (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
+ out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f;
// Row 6
- float8 out6 = 0.0f;
- out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f;
- out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) +
- (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
- out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) -
- (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
- out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
- (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
- out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
- (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
- out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
- (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
- out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
- (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
- out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+ out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f;
+ out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) +
+ (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
+ out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) -
+ (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
+ out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f
+ * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
+ out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f
+ * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
+ out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
+ (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
+ out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
+ (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
+ out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f;
// Row 7
- float8 out7 = 0.0f;
- out7.s0 = w40;
- out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f;
- out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f;
- out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f;
- out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f;
- out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f;
- out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f;
- out7.s7 = w44;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+ out7.s0 = w40;
+ out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f;
+ out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f;
+ out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f;
+ out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f;
+ out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f;
+ out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f;
+ out7.s7 = w44;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
int x0 = get_global_id(2); // idx filter
int y0 = get_global_id(0); // idx channel
// Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(float) + y0 * dst_stride_y;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
// Store the values across the channels
- *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global float *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global float *)(dst_addr + 5 * dst_stride_z) = out0.s5;
- *(__global float *)(dst_addr + 6 * dst_stride_z) = out0.s6;
- *(__global float *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global float *)(dst_addr + 8 * dst_stride_z) = out1.s0;
- *(__global float *)(dst_addr + 9 * dst_stride_z) = out1.s1;
- *(__global float *)(dst_addr + 10 * dst_stride_z) = out1.s2;
- *(__global float *)(dst_addr + 11 * dst_stride_z) = out1.s3;
- *(__global float *)(dst_addr + 12 * dst_stride_z) = out1.s4;
- *(__global float *)(dst_addr + 13 * dst_stride_z) = out1.s5;
- *(__global float *)(dst_addr + 14 * dst_stride_z) = out1.s6;
- *(__global float *)(dst_addr + 15 * dst_stride_z) = out1.s7;
- *(__global float *)(dst_addr + 16 * dst_stride_z) = out2.s0;
- *(__global float *)(dst_addr + 17 * dst_stride_z) = out2.s1;
- *(__global float *)(dst_addr + 18 * dst_stride_z) = out2.s2;
- *(__global float *)(dst_addr + 19 * dst_stride_z) = out2.s3;
- *(__global float *)(dst_addr + 20 * dst_stride_z) = out2.s4;
- *(__global float *)(dst_addr + 21 * dst_stride_z) = out2.s5;
- *(__global float *)(dst_addr + 22 * dst_stride_z) = out2.s6;
- *(__global float *)(dst_addr + 23 * dst_stride_z) = out2.s7;
- *(__global float *)(dst_addr + 24 * dst_stride_z) = out3.s0;
- *(__global float *)(dst_addr + 25 * dst_stride_z) = out3.s1;
- *(__global float *)(dst_addr + 26 * dst_stride_z) = out3.s2;
- *(__global float *)(dst_addr + 27 * dst_stride_z) = out3.s3;
- *(__global float *)(dst_addr + 28 * dst_stride_z) = out3.s4;
- *(__global float *)(dst_addr + 29 * dst_stride_z) = out3.s5;
- *(__global float *)(dst_addr + 30 * dst_stride_z) = out3.s6;
- *(__global float *)(dst_addr + 31 * dst_stride_z) = out3.s7;
- *(__global float *)(dst_addr + 32 * dst_stride_z) = out4.s0;
- *(__global float *)(dst_addr + 33 * dst_stride_z) = out4.s1;
- *(__global float *)(dst_addr + 34 * dst_stride_z) = out4.s2;
- *(__global float *)(dst_addr + 35 * dst_stride_z) = out4.s3;
- *(__global float *)(dst_addr + 36 * dst_stride_z) = out4.s4;
- *(__global float *)(dst_addr + 37 * dst_stride_z) = out4.s5;
- *(__global float *)(dst_addr + 38 * dst_stride_z) = out4.s6;
- *(__global float *)(dst_addr + 39 * dst_stride_z) = out4.s7;
- *(__global float *)(dst_addr + 40 * dst_stride_z) = out5.s0;
- *(__global float *)(dst_addr + 41 * dst_stride_z) = out5.s1;
- *(__global float *)(dst_addr + 42 * dst_stride_z) = out5.s2;
- *(__global float *)(dst_addr + 43 * dst_stride_z) = out5.s3;
- *(__global float *)(dst_addr + 44 * dst_stride_z) = out5.s4;
- *(__global float *)(dst_addr + 45 * dst_stride_z) = out5.s5;
- *(__global float *)(dst_addr + 46 * dst_stride_z) = out5.s6;
- *(__global float *)(dst_addr + 47 * dst_stride_z) = out5.s7;
- *(__global float *)(dst_addr + 48 * dst_stride_z) = out6.s0;
- *(__global float *)(dst_addr + 49 * dst_stride_z) = out6.s1;
- *(__global float *)(dst_addr + 50 * dst_stride_z) = out6.s2;
- *(__global float *)(dst_addr + 51 * dst_stride_z) = out6.s3;
- *(__global float *)(dst_addr + 52 * dst_stride_z) = out6.s4;
- *(__global float *)(dst_addr + 53 * dst_stride_z) = out6.s5;
- *(__global float *)(dst_addr + 54 * dst_stride_z) = out6.s6;
- *(__global float *)(dst_addr + 55 * dst_stride_z) = out6.s7;
- *(__global float *)(dst_addr + 56 * dst_stride_z) = out7.s0;
- *(__global float *)(dst_addr + 57 * dst_stride_z) = out7.s1;
- *(__global float *)(dst_addr + 58 * dst_stride_z) = out7.s2;
- *(__global float *)(dst_addr + 59 * dst_stride_z) = out7.s3;
- *(__global float *)(dst_addr + 60 * dst_stride_z) = out7.s4;
- *(__global float *)(dst_addr + 61 * dst_stride_z) = out7.s5;
- *(__global float *)(dst_addr + 62 * dst_stride_z) = out7.s6;
- *(__global float *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
#endif // defined(SRC_DIM_Z)
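Taken together, the out0 … out7 rows above (with their divisors 9, 81, 90, 180, 405, 810, 8100, 16200 and 32400) are the elementwise evaluation of the Winograd filter transform U = G W G^T for F(4x4, 5x5), where W is the 5x5 filter; the WINOGRAD_FILTER_TRANSFORM_HORIZONTAL/VERTICAL variants apply G to a single 5-tap row or column only. VEC_DATA_TYPE(DATA_TYPE, 8) token-pastes to float8 or half8 according to -DDATA_TYPE. As a sketch of the matrix the coefficients correspond to:

U = G\,W\,G^{T}, \qquad
G =
\begin{bmatrix}
 1               & 0                & 0               & 0                & 0               \\
 -\tfrac{2}{9}   & -\tfrac{2}{9}    & -\tfrac{2}{9}   & -\tfrac{2}{9}    & -\tfrac{2}{9}   \\
 -\tfrac{2}{9}   & \tfrac{2}{9}     & -\tfrac{2}{9}   & \tfrac{2}{9}     & -\tfrac{2}{9}   \\
 \tfrac{1}{90}   & \tfrac{2}{90}    & \tfrac{4}{90}   & \tfrac{8}{90}    & \tfrac{16}{90}  \\
 \tfrac{1}{90}   & -\tfrac{2}{90}   & \tfrac{4}{90}   & -\tfrac{8}{90}   & \tfrac{16}{90}  \\
 \tfrac{16}{180} & \tfrac{8}{180}   & \tfrac{4}{180}  & \tfrac{2}{180}   & \tfrac{1}{180}  \\
 \tfrac{16}{180} & -\tfrac{8}{180}  & \tfrac{4}{180}  & -\tfrac{2}{180}  & \tfrac{1}{180}  \\
 0               & 0                & 0               & 0                & 1
\end{bmatrix}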
@@ -1004,8 +1052,9 @@
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1052,8 +1101,9 @@
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1100,8 +1150,9 @@
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1148,8 +1199,9 @@
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1196,8 +1248,9 @@
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1246,8 +1299,9 @@
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1294,8 +1348,9 @@
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1342,8 +1397,9 @@
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1390,8 +1446,9 @@
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -1438,8 +1495,9 @@
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
* @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
index da18e4a..34bf290 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_input_transform.cl
@@ -52,8 +52,9 @@
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -69,86 +70,113 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2);
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
// Compute input address
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y);
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- float4 in_row0 = vload4(0, (__global float *)(src_addr));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- float4 in_row0 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)),
- *((__global float *)(src_addr + 1 * src_stride_y)),
- *((__global float *)(src_addr + 2 * src_stride_y)),
- *((__global float *)(src_addr + 3 * src_stride_y)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float4 in_row0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- float4 in_row1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- float4 in_row2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- float4 in_row3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float4 tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = in_row0;
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
tmp0 -= in_row2;
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float out00 = tmp0.s0 - tmp0.s2;
- float out01 = tmp0.s1 + tmp0.s2;
- float out02 = tmp0.s2 - tmp0.s1;
- float out03 = tmp0.s1 - tmp0.s3;
+ DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
+ DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
+ DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
+ DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float4 tmp1 = in_row1 + in_row2;
- float4 tmp2 = in_row2 - in_row1;
- float4 tmp3 = in_row1 - in_row3;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = in_row1 + in_row2;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = in_row2 - in_row1;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = in_row1 - in_row3;
- float out10 = tmp1.s0 - tmp1.s2;
- float out11 = tmp1.s1 + tmp1.s2;
- float out12 = tmp1.s2 - tmp1.s1;
- float out13 = tmp1.s1 - tmp1.s3;
+ DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
+ DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
+ DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
+ DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
- float out20 = tmp2.s0 - tmp2.s2;
- float out21 = tmp2.s1 + tmp2.s2;
- float out22 = tmp2.s2 - tmp2.s1;
- float out23 = tmp2.s1 - tmp2.s3;
+ DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
+ DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
+ DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
+ DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
- float out30 = tmp3.s0 - tmp3.s2;
- float out31 = tmp3.s1 + tmp3.s2;
- float out32 = tmp3.s2 - tmp3.s1;
- float out33 = tmp3.s1 - tmp3.s3;
+ DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
+ DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
+ DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
+ DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
- *((__global float *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;
- *((__global float *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;
- *((__global float *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;
- *((__global float *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- *((__global float *)(dst_addr + 4 * dst_stride_z)) = out10;
- *((__global float *)(dst_addr + 5 * dst_stride_z)) = out11;
- *((__global float *)(dst_addr + 6 * dst_stride_z)) = out12;
- *((__global float *)(dst_addr + 7 * dst_stride_z)) = out13;
- *((__global float *)(dst_addr + 8 * dst_stride_z)) = out20;
- *((__global float *)(dst_addr + 9 * dst_stride_z)) = out21;
- *((__global float *)(dst_addr + 10 * dst_stride_z)) = out22;
- *((__global float *)(dst_addr + 11 * dst_stride_z)) = out23;
- *((__global float *)(dst_addr + 12 * dst_stride_z)) = out30;
- *((__global float *)(dst_addr + 13 * dst_stride_z)) = out31;
- *((__global float *)(dst_addr + 14 * dst_stride_z)) = out32;
- *((__global float *)(dst_addr + 15 * dst_stride_z)) = out33;
+ *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
+ *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
+ *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
+ *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
+ *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
+ *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
+ *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
+ *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
+ *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
+ *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
+ *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
+ *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}
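The float-to-DATA_TYPE conversions in these hunks rely on the VEC_DATA_TYPE helper rather than hard-coded float vectors. As a reading aid, a minimal sketch of such a token-pasting helper (an assumption about what the library's helpers.h provides, not text from this patch) is:

// Illustrative only: token-pasting helper in the spirit of helpers.h, so that
// VEC_DATA_TYPE(half, 4) expands to the built-in OpenCL type half4 and
// VEC_DATA_TYPE(float, 2) to float2.
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

With -DDATA_TYPE=half, a declaration such as VEC_DATA_TYPE(DATA_TYPE, 4) in_row0 therefore compiles to half4 in_row0, which is what lets the same kernel body serve both the F32 and F16 paths.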
@@ -160,8 +188,9 @@
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -177,107 +206,159 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2) * 2;
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = (get_global_id(2) * 2) % SRC_DEPTH;
+ const int b = (get_global_id(2) * 2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2) * 2;
+#endif /* defined(SRC_DEPTH) */
// Compute input address
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y);
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- float4 in_row0 = vload4(0, (__global float *)(src_addr));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- float4 in_row0 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)),
- *((__global float *)(src_addr + 1 * src_stride_y)),
- *((__global float *)(src_addr + 2 * src_stride_y)),
- *((__global float *)(src_addr + 3 * src_stride_y)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float4 in_row0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- float4 in_row1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- float4 in_row2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- float4 in_row3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
src_addr += src_stride_z;
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- float4 in_row4 = vload4(0, (__global float *)(src_addr));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- float4 in_row4 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)),
- *((__global float *)(src_addr + 1 * src_stride_y)),
- *((__global float *)(src_addr + 2 * src_stride_y)),
- *((__global float *)(src_addr + 3 * src_stride_y)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float4 in_row4 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- float4 in_row5 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- float4 in_row6 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- float4 in_row7 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float4 tmp0 = in_row0;
- float4 tmp4 = in_row4;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp4 = in_row4;
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
tmp0 -= in_row2;
tmp4 -= in_row6;
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float2 out00 = (float2)(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
- float2 out01 = (float2)(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
- float2 out02 = (float2)(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
- float2 out03 = (float2)(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float4 tmp1 = in_row1 + in_row2;
- float4 tmp2 = in_row2 - in_row1;
- float4 tmp3 = in_row1 - in_row3;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = in_row1 + in_row2;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = in_row2 - in_row1;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = in_row1 - in_row3;
- float4 tmp5 = in_row5 + in_row6;
- float4 tmp6 = in_row6 - in_row5;
- float4 tmp7 = in_row5 - in_row7;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp5 = in_row5 + in_row6;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp6 = in_row6 - in_row5;
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp7 = in_row5 - in_row7;
- float2 out10 = (float2)(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
- float2 out11 = (float2)(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
- float2 out12 = (float2)(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
- float2 out13 = (float2)(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
- float2 out20 = (float2)(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
- float2 out21 = (float2)(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
- float2 out22 = (float2)(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
- float2 out23 = (float2)(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
- float2 out30 = (float2)(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
- float2 out31 = (float2)(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
- float2 out32 = (float2)(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
- float2 out33 = (float2)(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
- vstore2(out00, 0, (__global float *)(dst_addr + 0 * dst_stride_z));
- vstore2(out01, 0, (__global float *)(dst_addr + 1 * dst_stride_z));
- vstore2(out02, 0, (__global float *)(dst_addr + 2 * dst_stride_z));
- vstore2(out03, 0, (__global float *)(dst_addr + 3 * dst_stride_z));
+ vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
+ vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
+ vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
+ vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- vstore2(out10, 0, (__global float *)(dst_addr + 4 * dst_stride_z));
- vstore2(out11, 0, (__global float *)(dst_addr + 5 * dst_stride_z));
- vstore2(out12, 0, (__global float *)(dst_addr + 6 * dst_stride_z));
- vstore2(out13, 0, (__global float *)(dst_addr + 7 * dst_stride_z));
- vstore2(out20, 0, (__global float *)(dst_addr + 8 * dst_stride_z));
- vstore2(out21, 0, (__global float *)(dst_addr + 9 * dst_stride_z));
- vstore2(out22, 0, (__global float *)(dst_addr + 10 * dst_stride_z));
- vstore2(out23, 0, (__global float *)(dst_addr + 11 * dst_stride_z));
- vstore2(out30, 0, (__global float *)(dst_addr + 12 * dst_stride_z));
- vstore2(out31, 0, (__global float *)(dst_addr + 13 * dst_stride_z));
- vstore2(out32, 0, (__global float *)(dst_addr + 14 * dst_stride_z));
- vstore2(out33, 0, (__global float *)(dst_addr + 15 * dst_stride_z));
+ vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
+ vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
+ vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
+ vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
+ vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
+ vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
+ vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
+ vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
+ vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
+ vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
+ vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
+ vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}
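The new SRC_DEPTH define and the trailing src_stride_w/dst_stride_w arguments add batch support: dimension 2 of the NDRange now covers depth and batch together, and the kernel recovers (z, b) with the modulo/division seen above. A hedged host-side sketch of how such a kernel might be built (the option values, variable names and helper name are assumptions for illustration, not code from the library):

#include <stdio.h>
#include <CL/cl.h>

/* Illustrative only: build the program with the compile-time defines documented
 * in the kernel comments above. Defining SRC_DEPTH is what enables the (z, b)
 * split and the b * src_stride_w / b * dst_stride_w batch offsets added here. */
static cl_int build_winograd_input_transform(cl_program program, cl_device_id device,
                                             int src_depth, int num_tiles_x,
                                             int pad_left, int pad_top)
{
    char opts[256];
    snprintf(opts, sizeof(opts),
             "-DDATA_TYPE=half -DSRC_DEPTH=%d -DNUM_TILES_X=%d "
             "-DOUTPUT_TILE_W=2 -DOUTPUT_TILE_H=2 -DPAD_LEFT=%d -DPAD_TOP=%d",
             src_depth, num_tiles_x, pad_left, pad_top);
    return clBuildProgram(program, 1, &device, opts, NULL, NULL);
}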
@@ -289,8 +370,9 @@
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
* @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -306,40 +388,57 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2);
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
// Compute input address
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y);
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Row0
- float4 d00 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)),
- *((__global float *)(src_addr + 1 * src_stride_y)),
- *((__global float *)(src_addr + 2 * src_stride_y)),
- *((__global float *)(src_addr + 3 * src_stride_y)));
- float2 d01 = (float2)(*((__global float *)(src_addr + 4 * src_stride_y)),
- *((__global float *)(src_addr + 5 * src_stride_y)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Row0
- float4 d00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
- float2 d01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float out0 = 0.0f;
- float out1 = 0.0f;
- float out2 = 0.0f;
- float out3 = 0.0f;
- float out4 = 0.0f;
- float out5 = 0.0f;
+ DATA_TYPE out0 = 0.0f;
+ DATA_TYPE out1 = 0.0f;
+ DATA_TYPE out2 = 0.0f;
+ DATA_TYPE out3 = 0.0f;
+ DATA_TYPE out4 = 0.0f;
+ DATA_TYPE out5 = 0.0f;
// Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
@@ -351,16 +450,18 @@
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Row4
- float4 d40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
- float2 d41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
// k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
- float k0 = d41.s0;
- float k1 = d41.s0;
- float k2 = d41.s0;
- float k3 = d41.s0;
- float k4 = d41.s0;
- float k5 = 0.0f;
+ DATA_TYPE k0 = d41.s0;
+ DATA_TYPE k1 = d41.s0;
+ DATA_TYPE k2 = d41.s0;
+ DATA_TYPE k3 = d41.s0;
+ DATA_TYPE k4 = d41.s0;
+ DATA_TYPE k5 = 0.0f;
k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
@@ -377,8 +478,10 @@
out5 += k5;
// Row2
- float4 d20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
- float2 d21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
@@ -389,9 +492,13 @@
#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Compute destination address
- __global float *dst_addr = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+#if defined(SRC_DEPTH)
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(SRC_DEPTH) */
- uint dst_plane_stride = dst_stride_z / sizeof(float);
+ uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
*(dst_addr) = out0;
dst_addr += dst_plane_stride;
@@ -407,69 +514,73 @@
dst_addr += dst_plane_stride;
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float out6 = k0;
- float out7 = k1;
- float out8 = k2;
- float out9 = k3;
- float out10 = k4;
- float out11 = k5;
- float out12 = k0;
- float out13 = k1;
- float out14 = k2;
- float out15 = k3;
- float out16 = k4;
- float out17 = k5;
- float out18 = k0;
- float out19 = k1;
- float out20 = k2;
- float out21 = k3;
- float out22 = k4;
- float out23 = k5;
- float out24 = k0;
- float out25 = k1;
- float out26 = k2;
- float out27 = k3;
- float out28 = k4;
- float out29 = k5;
+ DATA_TYPE out6 = k0;
+ DATA_TYPE out7 = k1;
+ DATA_TYPE out8 = k2;
+ DATA_TYPE out9 = k3;
+ DATA_TYPE out10 = k4;
+ DATA_TYPE out11 = k5;
+ DATA_TYPE out12 = k0;
+ DATA_TYPE out13 = k1;
+ DATA_TYPE out14 = k2;
+ DATA_TYPE out15 = k3;
+ DATA_TYPE out16 = k4;
+ DATA_TYPE out17 = k5;
+ DATA_TYPE out18 = k0;
+ DATA_TYPE out19 = k1;
+ DATA_TYPE out20 = k2;
+ DATA_TYPE out21 = k3;
+ DATA_TYPE out22 = k4;
+ DATA_TYPE out23 = k5;
+ DATA_TYPE out24 = k0;
+ DATA_TYPE out25 = k1;
+ DATA_TYPE out26 = k2;
+ DATA_TYPE out27 = k3;
+ DATA_TYPE out28 = k4;
+ DATA_TYPE out29 = k5;
// Row1
- float4 d10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
- float2 d11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
// Row3
- float4 d30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
- float2 d31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
// Compute common parts for the channels between [6, 29]
// Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
// Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
- float part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
- float part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
- float part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
- float part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
- float part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
- float part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
- float part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
- float part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
- float part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
- float part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
- float part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
- float part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
+ DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
+ DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
+ DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
+ DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
+ DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
+ DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
+ DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
+ DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
+ DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
+ DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
// Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
// Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
- float part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
- float part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
- float part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
- float part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
- float part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
- float part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
- float part18 = part6 * 0.25f; // d20.s2 - d21.s0
- float part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
- float part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
- float part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
- float part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
- float part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
+ DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
+ DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
+ DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
+ DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
+ DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
+ DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
+ DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
+ DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
+ DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
+ DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
+ DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
+ DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
out6 += part0 - part1;
out12 += part0 + part1;
@@ -548,8 +659,10 @@
dst_addr += dst_plane_stride;
// Row5
- float4 d50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y));
- float2 d51 = vload2(2, (__global float *)(src_addr + 5 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
// Channels [30, 35]
out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
@@ -574,19 +687,17 @@
#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}
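For reference, the 16/20/4 and 4/5/1 constants scattered through the F(4x4, 3x3) kernels are the rows of the standard six-point 1-D Winograd input transform B^T, applied once per axis: e.g. 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0 is 4 * (4*d0 - 5*d2 + d4), i.e. row 0 of B^T along x scaled by row 0 along y. A scalar sketch of that 1-D transform, offered only as an aid for reading the fused arithmetic (the function name is illustrative, not library code):

/* Reference only: y = B^T * d for one 6-element input row, matching the
 * per-row coefficients used by winograd_input_transform_4x4_3x3_stepz1_nchw. */
void winograd_f4x4_3x3_input_transform_1d(const float d[6], float y[6])
{
    y[0] = 4.0f * d[0] - 5.0f * d[2] + d[4];
    y[1] = -4.0f * d[1] - 4.0f * d[2] + d[3] + d[4];
    y[2] = 4.0f * d[1] - 4.0f * d[2] - d[3] + d[4];
    y[3] = -2.0f * d[1] - d[2] + 2.0f * d[3] + d[4];
    y[4] = 2.0f * d[1] - d[2] - 2.0f * d[3] + d[4];
    y[5] = 4.0f * d[1] - 5.0f * d[3] + d[5];
}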
-#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
-/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5, 5x1 or 1x5, the output tile is 4x4, 4x1 or 1x4 and the data layout is NCHW
*
 * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
 * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -602,16 +713,234 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+ const int z = get_global_id(2) % SRC_DEPTH;
+ const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+ const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+ // Compute input address
+#if defined(SRC_DEPTH)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+ src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+ // Load input tile
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+ const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Calculate common factors for intermediate tensor
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact0 = 0.0f;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25 * in_row4;
+ tmp0 += -in_row6 + (DATA_TYPE)5.25 * in_row4 - (DATA_TYPE)5.25 * in_row2;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25 * in_row3;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact2 = (DATA_TYPE)0.25 * in_row2 - (DATA_TYPE)1.25 * in_row4 + in_row6;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
+
+ comm_fact0 = (DATA_TYPE)2.5 * in_row3;
+ comm_fact1 = (DATA_TYPE)0.5 * in_row1 - comm_fact0 + (DATA_TYPE)2.0 * in_row5;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
+
+ comm_fact1 = (DATA_TYPE)2.0 * in_row1 - comm_fact0 + (DATA_TYPE)0.5 * in_row5;
+ comm_fact2 = (DATA_TYPE)4.0 * in_row2 - (DATA_TYPE)5.0 * in_row4 + in_row6;
+
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25 * in_row3 - (DATA_TYPE)5.25 * in_row5;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Calculate output rows (reuse comm_fact0 vector)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0;
+
+ OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1, out2, out3, out4, out5, out6, out7;
+
+ OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
+ OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ // Store values across the channels
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+ *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+ *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+ *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+ *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+ *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
+ *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
+ *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+ *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+ *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+ *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+ *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+ *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+ *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+ *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+ *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+ *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+ *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+ *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+ *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+ *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+ *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+ *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+ *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+ *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+ *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+ *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+ *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+ *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+ *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+ *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+ *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+ *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+ *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+ *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+ *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+ *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+ *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+ *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+ *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+ *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+ *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+ *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+ *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+ *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+ *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+ *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+ *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+ *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+ *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+ *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+ *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+ *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+ *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+ *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+ *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+ *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+ *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+ *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+ *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+ *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
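Host-side, the batched variants launched after this patch need the two extra stride arguments and a third NDRange dimension spanning depth times batches. Below is a hedged sketch of that call sequence; it assumes TENSOR3D_DECLARATION() expands to eight kernel arguments per tensor (the usual helpers.h convention), so the new W strides land at argument indices 16 and 17, and the helper name and the assumption that the first sixteen arguments are already set are illustrative only:

#include <CL/cl.h>

/* Illustrative only: enqueue one of the NCHW input-transform kernels above.
 * Assumes the 16 TENSOR3D arguments (src, dst) have already been set and that
 * SRC_DEPTH was defined at build time, so gws[2] = src_depth * batches gets
 * split back into (z, b) inside the kernel. */
static cl_int enqueue_winograd_input_transform(cl_command_queue queue, cl_kernel kernel,
                                               cl_uint src_stride_w, cl_uint dst_stride_w,
                                               size_t num_tiles_x, size_t num_tiles_y,
                                               size_t src_depth, size_t batches)
{
    clSetKernelArg(kernel, 16, sizeof(cl_uint), &src_stride_w);
    clSetKernelArg(kernel, 17, sizeof(cl_uint), &dst_stride_w);

    const size_t gws[3] = { num_tiles_x, num_tiles_y, src_depth * batches };
    return clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, NULL, 0, NULL, NULL);
}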
+
+#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM_1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2);
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
+ const int z = get_global_id(2) % NUM_TILES_Y;
+ const int b = get_global_id(2) / NUM_TILES_Y;
+#else /* defined(NUM_TILES_Y) */
+ const int z = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(float);
+#if defined(NUM_TILES_Y)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else /* defined(NUM_TILES_Y) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
// Clamp coordinates. This clamp is valid for all rows
int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
@@ -637,19 +966,19 @@
// Clamp z coordinate
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- float d40 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d41 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
- float d42 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
- float d43 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
- float d44 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d45 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d40 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d41 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d42 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d43 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d44 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d45 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
- float k0 = d44;
- float k1 = d44;
- float k2 = d44;
- float k3 = d44;
- float k4 = d44;
- float k5 = (float)0.0f;
+ DATA_TYPE k0 = d44;
+ DATA_TYPE k1 = d44;
+ DATA_TYPE k2 = d44;
+ DATA_TYPE k3 = d44;
+ DATA_TYPE k4 = d44;
+ DATA_TYPE k5 = (DATA_TYPE)0.0f;
k0 += 4.0f * d40 - 5.0f * d42;
k1 += -4.0f * d41 - 4.0f * d42 + d43;
@@ -674,12 +1003,12 @@
valid_y1 = y_coord1;
#endif // if PAD_TOP == 0, we cannot read out of bounds
- float d00 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d01 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
- float d02 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
- float d03 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
- float d04 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d05 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
#else // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
int4 z_coords0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
int2 z_coords1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
@@ -692,20 +1021,20 @@
z_coords0 = clamp((int4)z_coords0, (int4)0, (int4)((int)SRC_DIM_2 - 1));
z_coords1 = clamp((int2)z_coords1, (int2)0, (int2)((int)SRC_DIM_2 - 1));
- float d00 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
- float d01 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
- float d02 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);
- float d03 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);
- float d04 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);
- float d05 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);
+ DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
+ DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
+ DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);
+ DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);
+ DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);
+ DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
- float out1 = -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 4.0f * d04;
- float out2 = 16.0f * d01 - 16.0f * d02 - 4.0f * d03 + 4.0f * d04;
- float out3 = -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 4.0f * d04;
- float out4 = 8.0f * d01 - 4.0f * d02 - 8.0f * d03 + 4.0f * d04;
- float out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05;
+ DATA_TYPE out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
+ DATA_TYPE out1 = -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 4.0f * d04;
+ DATA_TYPE out2 = 16.0f * d01 - 16.0f * d02 - 4.0f * d03 + 4.0f * d04;
+ DATA_TYPE out3 = -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 4.0f * d04;
+ DATA_TYPE out4 = 8.0f * d01 - 4.0f * d02 - 8.0f * d03 + 4.0f * d04;
+ DATA_TYPE out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05;
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Row2
@@ -716,12 +1045,12 @@
valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- float d20 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d21 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
- float d22 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
- float d23 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
- float d24 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d25 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d20 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d21 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d22 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d23 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d24 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d25 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
out0 += k0;
out1 += k1;
@@ -729,30 +1058,30 @@
out3 += k3;
out4 += k4;
out5 += k5;
- float out6 = k0;
- float out7 = k1;
- float out8 = k2;
- float out9 = k3;
- float out10 = k4;
- float out11 = k5;
- float out12 = k0;
- float out13 = k1;
- float out14 = k2;
- float out15 = k3;
- float out16 = k4;
- float out17 = k5;
- float out18 = k0;
- float out19 = k1;
- float out20 = k2;
- float out21 = k3;
- float out22 = k4;
- float out23 = k5;
- float out24 = k0;
- float out25 = k1;
- float out26 = k2;
- float out27 = k3;
- float out28 = k4;
- float out29 = k5;
+ DATA_TYPE out6 = k0;
+ DATA_TYPE out7 = k1;
+ DATA_TYPE out8 = k2;
+ DATA_TYPE out9 = k3;
+ DATA_TYPE out10 = k4;
+ DATA_TYPE out11 = k5;
+ DATA_TYPE out12 = k0;
+ DATA_TYPE out13 = k1;
+ DATA_TYPE out14 = k2;
+ DATA_TYPE out15 = k3;
+ DATA_TYPE out16 = k4;
+ DATA_TYPE out17 = k5;
+ DATA_TYPE out18 = k0;
+ DATA_TYPE out19 = k1;
+ DATA_TYPE out20 = k2;
+ DATA_TYPE out21 = k3;
+ DATA_TYPE out22 = k4;
+ DATA_TYPE out23 = k5;
+ DATA_TYPE out24 = k0;
+ DATA_TYPE out25 = k1;
+ DATA_TYPE out26 = k2;
+ DATA_TYPE out27 = k3;
+ DATA_TYPE out28 = k4;
+ DATA_TYPE out29 = k5;
// Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
out0 += -20.0f * d20 + 25.0f * d22 - 5.0f * d24;
@@ -764,20 +1093,25 @@
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Compute destination address
- __global float *dst_addr = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(float) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
- uint dst_plane_stride = dst_stride_z / sizeof(float);
+#if defined(NUM_TILES_Y)
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else /* defined(NUM_TILES_Y) */
+ __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(NUM_TILES_Y) */
- *((__global float *)dst_addr) = out0;
+ uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
+
+ *((__global DATA_TYPE *)dst_addr) = out0;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out1;
+ *((__global DATA_TYPE *)dst_addr) = out1;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out2;
+ *((__global DATA_TYPE *)dst_addr) = out2;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out3;
+ *((__global DATA_TYPE *)dst_addr) = out3;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out4;
+ *((__global DATA_TYPE *)dst_addr) = out4;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out5;
+ *((__global DATA_TYPE *)dst_addr) = out5;
dst_addr += dst_plane_stride;
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
@@ -787,12 +1121,12 @@
valid_y0 = y_coord0;
valid_y1 = y_coord1;
- float d10 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d11 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
- float d12 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
- float d13 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
- float d14 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d15 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d10 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d11 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d12 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d13 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d14 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d15 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
// Row3
z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
@@ -803,43 +1137,43 @@
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- float d30 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d31 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
- float d32 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
- float d33 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
- float d34 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d35 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d30 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d31 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d32 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d33 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d34 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d35 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
// Compute common parts for the channels between [6, 29]
// Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
// Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
- float part0 = -16.0f * d20 + 20.0f * d22 - 4.0f * d24;
- float part1 = 16.0f * d10 - 20.0f * d12 + 4.0f * d14 - 4.0f * d30 + 5.0f * d32 - d34;
- float part2 = 16.0f * d22 - 4.0f * d24;
- float part3 = 16.0f * d21 - 4.0f * d23;
- float part4 = 16.0f * d12 - 4.0f * d14 - 4.0f * d32 + d34;
- float part5 = 16.0f * d11 - 4.0f * d13 - 4.0f * d31 + d33;
- float part6 = 4.0f * d22 - 4.0f * d24;
- float part7 = 8.0f * d11 - 8.0f * d13 - 2.0f * d31 + 2.0f * d33;
- float part8 = 4.0f * d12 - 4.0f * d14 - d32 + d34;
- float part9 = 8.0f * d21 - 8.0f * d23;
- float part10 = -16.0f * d21 + 20.0f * d23 - 4.0f * d25;
- float part11 = -16.0f * d11 + 20.0f * d13 - 4.0f * d15 + 4.0f * d31 - 5.0f * d33 + d35;
+ DATA_TYPE part0 = -16.0f * d20 + 20.0f * d22 - 4.0f * d24;
+ DATA_TYPE part1 = 16.0f * d10 - 20.0f * d12 + 4.0f * d14 - 4.0f * d30 + 5.0f * d32 - d34;
+ DATA_TYPE part2 = 16.0f * d22 - 4.0f * d24;
+ DATA_TYPE part3 = 16.0f * d21 - 4.0f * d23;
+ DATA_TYPE part4 = 16.0f * d12 - 4.0f * d14 - 4.0f * d32 + d34;
+ DATA_TYPE part5 = 16.0f * d11 - 4.0f * d13 - 4.0f * d31 + d33;
+ DATA_TYPE part6 = 4.0f * d22 - 4.0f * d24;
+ DATA_TYPE part7 = 8.0f * d11 - 8.0f * d13 - 2.0f * d31 + 2.0f * d33;
+ DATA_TYPE part8 = 4.0f * d12 - 4.0f * d14 - d32 + d34;
+ DATA_TYPE part9 = 8.0f * d21 - 8.0f * d23;
+ DATA_TYPE part10 = -16.0f * d21 + 20.0f * d23 - 4.0f * d25;
+ DATA_TYPE part11 = -16.0f * d11 + 20.0f * d13 - 4.0f * d15 + 4.0f * d31 - 5.0f * d33 + d35;
// Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
// Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
- float part12 = 8.0f * d10 - 10.0f * d12 + 2.0f * d14 - 8.0f * d30 + 10.0f * d32 - 2.0f * d34;
- float part13 = part0 * 0.25f; // -4.0f * d20 + 5.0f * d22 - d24
- float part14 = part2 * 0.25f; // 4.0f * d22 - d24
- float part15 = 8.0f * d11 - 2.0f * d13 - 8.0f * d31 + 2.0f * d33;
- float part16 = 8.0f * d12 - 2.0f * d14 - 8.0f * d32 + 2.0f * d34;
- float part17 = part3 * 0.25f; // 4.0f * d21 - d23
- float part18 = part6 * 0.25f; // d22 - d24
- float part19 = 4.0f * d11 - 4.0f * d13 - 4.0f * d31 + 4.0f * d33;
- float part20 = 2.0f * d12 - 2.0f * d14 - 2.0f * d32 + 2.0f * d34;
- float part21 = part9 * 0.25f; // 2.0f * (d21 - d23)
- float part22 = part10 * 0.25f; // - 4.0f * d21 + 5.0f * d23 - d25
- float part23 = part11 * 0.5f + 6.0f * d31 - 7.5f * d33 + 1.5f * d35; // - 8.0f * d11 + 10.0f * d13 - 2.0f * d15 + 8.0f * d31 - 10.0f * d33 + 2.0f * d35;
+ DATA_TYPE part12 = 8.0f * d10 - 10.0f * d12 + 2.0f * d14 - 8.0f * d30 + 10.0f * d32 - 2.0f * d34;
+ DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20 + 5.0f * d22 - d24
+ DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d22 - d24
+ DATA_TYPE part15 = 8.0f * d11 - 2.0f * d13 - 8.0f * d31 + 2.0f * d33;
+ DATA_TYPE part16 = 8.0f * d12 - 2.0f * d14 - 8.0f * d32 + 2.0f * d34;
+ DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d21 - d23
+ DATA_TYPE part18 = part6 * 0.25f; // d22 - d24
+ DATA_TYPE part19 = 4.0f * d11 - 4.0f * d13 - 4.0f * d31 + 4.0f * d33;
+ DATA_TYPE part20 = 2.0f * d12 - 2.0f * d14 - 2.0f * d32 + 2.0f * d34;
+ DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d21 - d23)
+ DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d21 + 5.0f * d23 - d25
+ DATA_TYPE part23 = part11 * 0.5f + 6.0f * d31 - 7.5f * d33 + 1.5f * d35; // - 8.0f * d11 + 10.0f * d13 - 2.0f * d15 + 8.0f * d31 - 10.0f * d33 + 2.0f * d35;
out6 += part0 - part1;
out12 += part0 + part1;
@@ -867,54 +1201,54 @@
out23 += part22 + part23;
out29 += part22 - part23;
- *((__global float *)dst_addr) = out6;
+ *((__global DATA_TYPE *)dst_addr) = out6;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out7;
+ *((__global DATA_TYPE *)dst_addr) = out7;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out8;
+ *((__global DATA_TYPE *)dst_addr) = out8;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out9;
+ *((__global DATA_TYPE *)dst_addr) = out9;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out10;
+ *((__global DATA_TYPE *)dst_addr) = out10;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out11;
+ *((__global DATA_TYPE *)dst_addr) = out11;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out12;
+ *((__global DATA_TYPE *)dst_addr) = out12;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out13;
+ *((__global DATA_TYPE *)dst_addr) = out13;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out14;
+ *((__global DATA_TYPE *)dst_addr) = out14;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out15;
+ *((__global DATA_TYPE *)dst_addr) = out15;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out16;
+ *((__global DATA_TYPE *)dst_addr) = out16;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out17;
+ *((__global DATA_TYPE *)dst_addr) = out17;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out18;
+ *((__global DATA_TYPE *)dst_addr) = out18;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out19;
+ *((__global DATA_TYPE *)dst_addr) = out19;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out20;
+ *((__global DATA_TYPE *)dst_addr) = out20;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out21;
+ *((__global DATA_TYPE *)dst_addr) = out21;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out22;
+ *((__global DATA_TYPE *)dst_addr) = out22;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out23;
+ *((__global DATA_TYPE *)dst_addr) = out23;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out24;
+ *((__global DATA_TYPE *)dst_addr) = out24;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out25;
+ *((__global DATA_TYPE *)dst_addr) = out25;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out26;
+ *((__global DATA_TYPE *)dst_addr) = out26;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out27;
+ *((__global DATA_TYPE *)dst_addr) = out27;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out28;
+ *((__global DATA_TYPE *)dst_addr) = out28;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out29;
+ *((__global DATA_TYPE *)dst_addr) = out29;
dst_addr += dst_plane_stride;
// Row5
@@ -926,12 +1260,12 @@
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- float d50 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d51 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
- float d52 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
- float d53 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
- float d54 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
- float d55 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d50 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d51 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d52 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d53 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d54 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ DATA_TYPE d55 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
// Channels [30, 35]
out0 = 16.0f * d10 - 20.0f * d12 - 20.0f * d30 + 25.0f * d32 + 4.0f * d50 - 5.0f * d52 + d54 + 4.0f * d14 - 5.0f * d34;
@@ -941,17 +1275,17 @@
out4 = 8.0f * d11 - 4.0f * d12 - 8.0f * d13 - 10.0f * d31 + 5.0f * d32 + 10.0f * d33 + 2.0f * d51 - 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;
out5 = 16.0f * d11 - 20.0f * d13 + 4.0f * d15 - 20.0f * d31 + 25.0f * d33 - 5.0f * d35 + 4.0f * d51 - 5.0f * d53 + d55;
- *((__global float *)dst_addr) = out0;
+ *((__global DATA_TYPE *)dst_addr) = out0;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out1;
+ *((__global DATA_TYPE *)dst_addr) = out1;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out2;
+ *((__global DATA_TYPE *)dst_addr) = out2;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out3;
+ *((__global DATA_TYPE *)dst_addr) = out3;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out4;
+ *((__global DATA_TYPE *)dst_addr) = out4;
dst_addr += dst_plane_stride;
- *((__global float *)dst_addr) = out5;
+ *((__global DATA_TYPE *)dst_addr) = out5;
dst_addr += dst_plane_stride;
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}
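
When NUM_TILES_Y is defined, the kernel above folds the batch index into the third NDRange dimension, recovers it with get_global_id(2) % NUM_TILES_Y and get_global_id(2) / NUM_TILES_Y, and offsets the addresses by b * src_stride_w and b * dst_stride_w. A minimal host-side sketch of the matching dispatch follows; the helper name and the choice of one work-item per input channel on the x axis are illustrative assumptions, not part of the library API.

    #include <CL/cl.h>

    /* Hypothetical helper: enqueue the batched NHWC input transform.
     * Assumes the kernel was built with -DNUM_TILES_Y (plus the other -D
     * options); the parameter names are placeholders for illustration. */
    static cl_int enqueue_input_transform(cl_command_queue queue, cl_kernel kernel,
                                          size_t channels, size_t num_tiles_x,
                                          size_t num_tiles_y, size_t batches)
    {
        /* z packs the tile row and the batch; the kernel unpacks them with
         * get_global_id(2) % NUM_TILES_Y and get_global_id(2) / NUM_TILES_Y. */
        const size_t gws[3] = { channels, num_tiles_x, num_tiles_y * batches };
        return clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, NULL, 0, NULL, NULL);
    }
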
@@ -966,8 +1300,9 @@
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -983,17 +1318,30 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2);
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
+ const int z = get_global_id(2) % NUM_TILES_Y;
+ const int b = get_global_id(2) / NUM_TILES_Y;
+#else /* defined(NUM_TILES_Y) */
+ const int z = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
// Compute input address
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(float);
+#if defined(NUM_TILES_Y)
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else /* defined(NUM_TILES_Y) */
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
// Clamp coordinates. This clamp is valid for all rows
@@ -1005,21 +1353,25 @@
int z_coord = z * OUTPUT_TILE_H;
// Load the input tile
- float8 in_row0;
- in_row0.s0 = *(__global float *)(src_addr + y_coord.s0 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s1 = *(__global float *)(src_addr + y_coord.s1 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s2 = *(__global float *)(src_addr + y_coord.s2 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s3 = *(__global float *)(src_addr + y_coord.s3 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s4 = *(__global float *)(src_addr + y_coord.s4 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s5 = *(__global float *)(src_addr + y_coord.s5 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s6 = *(__global float *)(src_addr + y_coord.s6 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s7 = *(__global float *)(src_addr + y_coord.s7 * (int)src_stride_y + z_coord * src_stride_z);
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ in_row0;
+ in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord.s4 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord.s5 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord.s6 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Calculate common factors for intermediate tensor
- float8 comm_fact0 = 0.0f;
- float8 tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact0 = 0.0f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp0 = in_row0;
- float8 out0 = (float8)0.0f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
@@ -1035,25 +1387,30 @@
z_coord = clamp(z_coord, (int8)0, (int8)SRC_DIM_2 - 1); // Clamp z coordinate
// Load the input tile
- float8 in_row0;
- in_row0.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord.s0 * src_stride_z);
- in_row0.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * src_stride_z);
- in_row0.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * src_stride_z);
- in_row0.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * src_stride_z);
- in_row0.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * src_stride_z);
- in_row0.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * src_stride_z);
- in_row0.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * src_stride_z);
- in_row0.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * src_stride_z);
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ in_row0;
+ in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord.s0 * src_stride_z);
+ in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * src_stride_z);
+ in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * src_stride_z);
+ in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * src_stride_z);
+ in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * src_stride_z);
+ in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * src_stride_z);
+ in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * src_stride_z);
+ in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * src_stride_z);
// Calculate common factors for intermediate tensor
- float8 comm_fact0 = 0.0f;
- float8 tmp0 = in_row0;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact0 = 0.0f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ tmp0 = in_row0;
- float8 out0 = (float8)0.0f;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- float8 in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
// Clamp coordinates. This clamp is valid for all rows
int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
@@ -1066,14 +1423,14 @@
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); // Clamp z coordinate
// Load the input tile
- in_row0.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
- in_row0.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row1
z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;
@@ -1081,14 +1438,14 @@
valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- in_row1.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
- in_row1.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
- in_row1.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
- in_row1.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
- in_row1.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
- in_row1.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
- in_row1.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
- in_row1.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row1.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row1.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row1.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row1.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row1.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row1.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row1.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row1.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row2
z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;
@@ -1096,14 +1453,14 @@
valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- in_row2.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
- in_row2.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
- in_row2.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
- in_row2.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
- in_row2.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
- in_row2.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
- in_row2.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
- in_row2.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row2.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row2.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row2.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row2.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row2.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row2.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row2.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row2.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row3
z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
@@ -1111,14 +1468,14 @@
valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- in_row3.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
- in_row3.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
- in_row3.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
- in_row3.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
- in_row3.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
- in_row3.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
- in_row3.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
- in_row3.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row3.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row3.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row3.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row3.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row3.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row3.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row3.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row3.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row4
z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 4;
@@ -1126,14 +1483,14 @@
valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- in_row4.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
- in_row4.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
- in_row4.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
- in_row4.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
- in_row4.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
- in_row4.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
- in_row4.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
- in_row4.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row4.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row4.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row4.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row4.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row4.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row4.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row4.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row4.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row5
z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;
@@ -1141,14 +1498,14 @@
valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- in_row5.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
- in_row5.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
- in_row5.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
- in_row5.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
- in_row5.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
- in_row5.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
- in_row5.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
- in_row5.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row5.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row5.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row5.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row5.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row5.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row5.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row5.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row5.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row6
z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 6;
@@ -1156,14 +1513,14 @@
valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- in_row6.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
- in_row6.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
- in_row6.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
- in_row6.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
- in_row6.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
- in_row6.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
- in_row6.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
- in_row6.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row6.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row6.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row6.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row6.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row6.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row6.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row6.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row6.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
// Row7
z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 7;
@@ -1171,39 +1528,43 @@
valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
- in_row7.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
- in_row7.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
- in_row7.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
- in_row7.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
- in_row7.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
- in_row7.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
- in_row7.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
- in_row7.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row7.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row7.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row7.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row7.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row7.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row7.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row7.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+ in_row7.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
- float8 comm_fact0 = in_row2 + in_row6 - 4.25f * in_row4;
- float8 comm_fact1 = in_row1 + in_row5 - 4.25f * in_row3;
- float8 comm_fact2 = 0.25f * in_row2 - 1.25f * in_row4 + in_row6;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact0 = in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
// Calculate intermediate tensor and reuse common factor vectors
- const float8 tmp0 = in_row0 - in_row6 + 5.25f * in_row4 - 5.25f * in_row2;
- const float8 tmp1 = comm_fact0 + comm_fact1;
- const float8 tmp2 = comm_fact0 - comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = in_row0 - in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
- comm_fact0 = 2.5f * in_row3;
- comm_fact1 = 0.5f * in_row1 - comm_fact0 + 2.f * in_row5;
+ comm_fact0 = (DATA_TYPE)2.5f * in_row3;
+ comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.f * in_row5;
- const float8 tmp3 = comm_fact1 + comm_fact2;
- const float8 tmp4 = comm_fact2 - comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
- comm_fact1 = 2.f * in_row1 - comm_fact0 + 0.5f * in_row5;
- comm_fact2 = 4.f * in_row2 - 5.f * in_row4 + in_row6;
+ comm_fact1 = (DATA_TYPE)2.f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
+ comm_fact2 = (DATA_TYPE)4.f * in_row2 - (DATA_TYPE)5.f * in_row4 + in_row6;
- const float8 tmp5 = comm_fact1 + comm_fact2;
- const float8 tmp6 = comm_fact2 - comm_fact1;
- const float8 tmp7 = in_row7 - in_row1 + 5.25f * in_row3 - 5.25f * in_row5;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
+ const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
// Calculate output rows (reuse comm_fact0 vector)
- float8 out0, out1, out2, out3, out4, out5, out6, out7;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0, out1, out2, out3, out4, out5, out6, out7;
OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
@@ -1212,260 +1573,85 @@
OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
// Store values across the channels
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(float) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
+#if defined(NUM_TILES_Y)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else /* defined(NUM_TILES_Y) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(NUM_TILES_Y) */
- *((__global float *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
- *((__global float *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
- *((__global float *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
- *((__global float *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
- *((__global float *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
- *((__global float *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
- *((__global float *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
- *((__global float *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+ *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+ *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+ *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+ *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- *((__global float *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
- *((__global float *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
- *((__global float *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
- *((__global float *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
- *((__global float *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
- *((__global float *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
- *((__global float *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
- *((__global float *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
- *((__global float *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
- *((__global float *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
- *((__global float *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
- *((__global float *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
- *((__global float *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
- *((__global float *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
- *((__global float *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
- *((__global float *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
- *((__global float *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
- *((__global float *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
- *((__global float *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
- *((__global float *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
- *((__global float *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
- *((__global float *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
- *((__global float *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
- *((__global float *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
- *((__global float *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
- *((__global float *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
- *((__global float *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
- *((__global float *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
- *((__global float *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
- *((__global float *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
- *((__global float *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
- *((__global float *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
- *((__global float *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
- *((__global float *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
- *((__global float *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
- *((__global float *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
- *((__global float *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
- *((__global float *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
- *((__global float *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
- *((__global float *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
- *((__global float *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
- *((__global float *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
- *((__global float *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
- *((__global float *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
- *((__global float *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
- *((__global float *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
- *((__global float *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
- *((__global float *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
- *((__global float *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
- *((__global float *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
- *((__global float *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
- *((__global float *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
- *((__global float *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
- *((__global float *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
- *((__global float *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
- *((__global float *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+ *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
+ *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
+ *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+ *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+ *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+ *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+ *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+ *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+ *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+ *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+ *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+ *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+ *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+ *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+ *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+ *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+ *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+ *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+ *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+ *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+ *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+ *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+ *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+ *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+ *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+ *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+ *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+ *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+ *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+ *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+ *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+ *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+ *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+ *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+ *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+ *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+ *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+ *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+ *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+ *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+ *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+ *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+ *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+ *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+ *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+ *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+ *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+ *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+ *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+ *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+ *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+ *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+ *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+ *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+ *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+ *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}
#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
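
The 4x4_5x5 NHWC kernel above is configured entirely at compile time: DATA_TYPE, NUM_TILES_X (and NUM_TILES_Y for the batched path), PAD_LEFT/PAD_TOP, OUTPUT_TILE_W/OUTPUT_TILE_H and SRC_DIM_1/SRC_DIM_2 are all injected as -D options. A minimal host-side sketch of assembling those options with the standard OpenCL API is shown below; the helper name and the placeholder values are assumptions, only the -D names come from the kernel source.

    #include <CL/cl.h>
    #include <stdio.h>

    /* Hypothetical helper: build the program with the -D options the kernel
     * documentation lists. OUTPUT_TILE_W/H are fixed to 4 because this variant
     * computes a 4x4 output tile; every other value is a placeholder. */
    static cl_int build_winograd_input_transform(cl_program program, cl_device_id device,
                                                 int use_fp16, int num_tiles_x, int num_tiles_y,
                                                 int pad_left, int pad_top,
                                                 int src_dim_1, int src_dim_2)
    {
        char options[512];
        snprintf(options, sizeof(options),
                 "-DDATA_TYPE=%s -DNUM_TILES_X=%d -DNUM_TILES_Y=%d "
                 "-DPAD_LEFT=%d -DPAD_TOP=%d -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4 "
                 "-DSRC_DIM_1=%d -DSRC_DIM_2=%d",
                 use_fp16 ? "half" : "float", num_tiles_x, num_tiles_y,
                 pad_left, pad_top, src_dim_1, src_dim_2);
        return clBuildProgram(program, 1, &device, options, NULL, NULL);
    }
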
-/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
- // Compute input address
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-
- src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y);
-
- // Load input tile
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- const float8 in_row0 = vload8(0, (__global float *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- const float8 in_row0 = (float8)(*((__global float *)(src_addr + 0 * src_stride_y)),
- *((__global float *)(src_addr + 1 * src_stride_y)),
- *((__global float *)(src_addr + 2 * src_stride_y)),
- *((__global float *)(src_addr + 3 * src_stride_y)),
- *((__global float *)(src_addr + 4 * src_stride_y)),
- *((__global float *)(src_addr + 5 * src_stride_y)),
- *((__global float *)(src_addr + 6 * src_stride_y)),
- *((__global float *)(src_addr + 7 * src_stride_y)));
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- const float8 in_row0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
- const float8 in_row1 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
- const float8 in_row2 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
- const float8 in_row3 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
- const float8 in_row4 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
- const float8 in_row5 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
- const float8 in_row6 = vload8(0, (__global float *)(src_addr + 6 * src_stride_y));
- const float8 in_row7 = vload8(0, (__global float *)(src_addr + 7 * src_stride_y));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Calculate common factors for intermediate tensor
- float8 tmp0 = in_row0;
- float8 comm_fact0 = 0.0f;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- comm_fact0 += in_row2 + in_row6 - 4.25f * in_row4;
- tmp0 += -in_row6 + 5.25f * in_row4 - 5.25f * in_row2;
-
- float8 comm_fact1 = in_row1 + in_row5 - 4.25f * in_row3;
- float8 comm_fact2 = 0.25f * in_row2 - 1.25f * in_row4 + in_row6;
-
- const float8 tmp1 = comm_fact0 + comm_fact1;
- const float8 tmp2 = comm_fact0 - comm_fact1;
-
- comm_fact0 = 2.5f * in_row3;
- comm_fact1 = 0.5f * in_row1 - comm_fact0 + 2.f * in_row5;
-
- const float8 tmp3 = comm_fact1 + comm_fact2;
- const float8 tmp4 = comm_fact2 - comm_fact1;
-
- comm_fact1 = 2.f * in_row1 - comm_fact0 + 0.5f * in_row5;
- comm_fact2 = 4.f * in_row2 - 5.f * in_row4 + in_row6;
-
- const float8 tmp5 = comm_fact1 + comm_fact2;
- const float8 tmp6 = comm_fact2 - comm_fact1;
- const float8 tmp7 = in_row7 - in_row1 + 5.25f * in_row3 - 5.25f * in_row5;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Calculate output rows (reuse comm_fact0 vector)
- float8 out0;
-
- OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- float8 out1, out2, out3, out4, out5, out6, out7;
-
- OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
- OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- // Store values across the channels
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
-
- *((__global float *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
- *((__global float *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
- *((__global float *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
- *((__global float *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
- *((__global float *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
- *((__global float *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
- *((__global float *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
- *((__global float *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
- *((__global float *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
- *((__global float *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
- *((__global float *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
- *((__global float *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
- *((__global float *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
- *((__global float *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
- *((__global float *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
- *((__global float *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
- *((__global float *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
- *((__global float *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
- *((__global float *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
- *((__global float *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
- *((__global float *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
- *((__global float *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
- *((__global float *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
- *((__global float *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
- *((__global float *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
- *((__global float *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
- *((__global float *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
- *((__global float *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
- *((__global float *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
- *((__global float *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
- *((__global float *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
- *((__global float *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
- *((__global float *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
- *((__global float *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
- *((__global float *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
- *((__global float *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
- *((__global float *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
- *((__global float *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
- *((__global float *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
- *((__global float *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
- *((__global float *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
- *((__global float *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
- *((__global float *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
- *((__global float *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
- *((__global float *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
- *((__global float *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
- *((__global float *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
- *((__global float *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
- *((__global float *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
- *((__global float *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
- *((__global float *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
- *((__global float *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
- *((__global float *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
- *((__global float *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
- *((__global float *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
- *((__global float *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
- *((__global float *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
- *((__global float *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
- *((__global float *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
- *((__global float *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
- *((__global float *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
- *((__global float *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
- *((__global float *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
- *((__global float *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
*
@@ -1474,8 +1660,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1491,10 +1678,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
src_stride_x,
@@ -1511,7 +1702,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
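
For context only (not part of the patch): the @note lines above describe the compile-time configuration these wrapper kernels rely on. A minimal, hypothetical set of program build options for this 2x1/3x1 horizontal variant is sketched below; only the option names quoted in the documentation and in the kernel code are taken from the source, the numeric values are placeholders.

/* Hypothetical build options for winograd_input_transform_2x1_3x1_stepz1_nchw.
 * Option names follow the @note documentation above; NUM_TILES_X appears in the
 * kernel code, but the value chosen here is a placeholder. */
static const char *winograd_2x1_3x1_opts =
    "-DDATA_TYPE=half "                 /* or float */
    "-DNUM_TILES_X=16 "                 /* placeholder: number of tiles along X */
    "-DOUTPUT_TILE_W=2 -DOUTPUT_TILE_H=1 "
    "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL";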
/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2
@@ -1521,8 +1714,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1538,10 +1732,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
src_stride_x,
@@ -1558,7 +1756,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
@@ -1568,8 +1768,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1585,10 +1786,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
src_stride_x,
@@ -1605,7 +1810,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
@@ -1615,8 +1822,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1632,10 +1840,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
src_stride_x,
@@ -1652,7 +1864,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
@@ -1665,8 +1879,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1682,10 +1897,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
src_stride_x,
@@ -1702,7 +1921,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
@@ -1714,8 +1935,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
* @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1731,10 +1953,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
src_stride_x,
@@ -1751,9 +1977,11 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
-#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
+#endif // defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
@@ -1764,8 +1992,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1781,10 +2010,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
src_stride_x,
@@ -1801,7 +2034,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2
@@ -1811,8 +2046,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1828,10 +2064,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
src_stride_x,
@@ -1848,7 +2088,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
@@ -1858,8 +2100,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1875,10 +2118,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
src_stride_x,
@@ -1895,7 +2142,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
@@ -1905,8 +2154,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1922,10 +2172,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
src_stride_x,
@@ -1942,7 +2196,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
@@ -1955,8 +2211,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -1972,10 +2229,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
src_stride_x,
@@ -1992,7 +2253,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
@@ -2004,8 +2267,9 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
@@ -2021,10 +2285,14 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+ TENSOR3D_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
src_stride_x,
@@ -2041,7 +2309,9 @@
dst_step_y,
dst_stride_z,
dst_step_z,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ src_stride_w,
+ dst_stride_w);
}
#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
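
Reading aid, not part of the patch: the new src_stride_w/dst_stride_w arguments extend the existing 3D byte addressing to batched tensors, and the batch index is recovered by splitting the third global-ID dimension, as the output-transform kernels below do with SRC_DEPTH. A minimal C sketch of both steps, with illustrative function names:

/* Sketch only: mirrors the addressing pattern used by the kernels, not library code. */
static inline unsigned int split_batch(unsigned int gid_z, unsigned int depth, unsigned int *z)
{
    *z = gid_z % depth;   /* plane index inside one batch */
    return gid_z / depth; /* batch index, multiplied by the W stride below */
}

static inline const unsigned char *element_addr_4d(const unsigned char *ptr,
                                                   unsigned int offset_first_element_in_bytes,
                                                   unsigned int x_bytes, unsigned int y,
                                                   unsigned int z, unsigned int batch,
                                                   unsigned int stride_y, unsigned int stride_z,
                                                   unsigned int stride_w)
{
    /* x offset is already in bytes; y/z/batch are element indices scaled by their strides */
    return ptr + offset_first_element_in_bytes + x_bytes + y * stride_y + z * stride_z + batch * stride_w;
}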
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
index a1e7b3e..f52b027 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_output_transform.cl
@@ -31,27 +31,32 @@
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_2x2_3x3_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bias)
@@ -59,15 +64,19 @@
)
{
// Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
// Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
- float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
- float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
- float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
- float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
// Compute the 2x1 or 1x2 output tile
@@ -77,20 +86,20 @@
float out00 = d00 + d01 + d02;
float out01 = d01 - d02 - d03;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- float d10 = *((__global float *)(src_addr + 4 * src_stride_z));
- float d11 = *((__global float *)(src_addr + 5 * src_stride_z));
- float d12 = *((__global float *)(src_addr + 6 * src_stride_z));
- float d13 = *((__global float *)(src_addr + 7 * src_stride_z));
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
- float d20 = *((__global float *)(src_addr + 8 * src_stride_z));
- float d21 = *((__global float *)(src_addr + 9 * src_stride_z));
- float d22 = *((__global float *)(src_addr + 10 * src_stride_z));
- float d23 = *((__global float *)(src_addr + 11 * src_stride_z));
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
- float d30 = *((__global float *)(src_addr + 12 * src_stride_z));
- float d31 = *((__global float *)(src_addr + 13 * src_stride_z));
- float d32 = *((__global float *)(src_addr + 14 * src_stride_z));
- float d33 = *((__global float *)(src_addr + 15 * src_stride_z));
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
// Compute the 2x2 output tile
float k0 = d01 + d11 + d21;
@@ -118,36 +127,43 @@
int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
#if defined(HAS_BIAS)
// Add bias
Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
- float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
out00 += (float)b;
out01 += (float)b;
#endif // defined(HAS_BIAS)
// Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
// Store the output tile
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
- *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = (DATA_TYPE)out00;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = (DATA_TYPE)out01;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore2((float2)(out00, out01), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+ vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out00, out01), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#if defined(HAS_BIAS)
// Add bias
- out10 += (float)b;
- out11 += (float)b;
+ out10 += (DATA_TYPE)b;
+ out11 += (DATA_TYPE)b;
#endif // defined(HAS_BIAS)
- vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+ vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))((DATA_TYPE)out10, (DATA_TYPE)out11), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
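
As a reading aid (not part of the patch), the arithmetic above is the F(2x2, 3x3) Winograd output transform. A plain C reference that applies the 1D transform visible in the kernel (out0 = d0 + d1 + d2, out1 = d1 - d2 - d3) along both axes of the 4x4 input tile yields the 2x2 output tile; the function name and row/column layout are illustrative.

/* Reference sketch in plain C, not library code. */
static void winograd_output_f2x2_3x3_ref(const float d[4][4], float out[2][2])
{
    float tmp[2][4];
    for(int j = 0; j < 4; ++j)
    {
        tmp[0][j] = d[0][j] + d[1][j] + d[2][j]; /* first 1D pass, along columns */
        tmp[1][j] = d[1][j] - d[2][j] - d[3][j];
    }
    for(int i = 0; i < 2; ++i)
    {
        out[i][0] = tmp[i][0] + tmp[i][1] + tmp[i][2]; /* second 1D pass, along rows */
        out[i][1] = tmp[i][1] - tmp[i][2] - tmp[i][3];
    }
}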
@@ -158,27 +174,32 @@
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_4x4_3x3_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bias)
@@ -186,17 +207,21 @@
)
{
// Each thread stores a 4x4/4x1 or 1x4 tile
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
// Load the values across the channels to compose the 6x6 or 6x1 tile
- float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
- float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
- float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
- float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
- float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
- float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
// Compute out00, out01, out02 and out03
@@ -205,46 +230,46 @@
float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
- float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
- float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
- float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
- float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
- float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
- float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
- float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
- float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
- float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
- float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
- float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
- float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
- float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
- float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
- float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
- float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
- float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
- float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
- float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
- float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
- float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
- float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
- float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
- float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
- float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
- float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
- float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
- float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
- float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
// Compute out00, out01, out02 and out03
- float out00 = d01 + d21 + d41 + d11 + d31;
- float out01 = d01 + d21 + d41 + d11 + d31;
- float out02 = d01 + d21 + d41 + d11 + d31;
- float out03 = d01 + d21 + d41 + d11 + d31;
+ float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+ float out03 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
@@ -301,12 +326,15 @@
int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
#if defined(HAS_BIAS)
// Add bias
Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
- float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
out00 += (float)b;
out01 += (float)b;
@@ -315,16 +343,20 @@
#endif // defined(HAS_BIAS)
// Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
// Store the output tile
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
- *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
- *((__global float *)(dst_addr + 2 * dst_stride_y)) = out02;
- *((__global float *)(dst_addr + 3 * dst_stride_y)) = out03;
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = (DATA_TYPE)out00;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = (DATA_TYPE)out01;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = (DATA_TYPE)out02;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = (DATA_TYPE)out03;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out00, (DATA_TYPE)out01, (DATA_TYPE)out02, (DATA_TYPE)out03), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -345,9 +377,9 @@
out32 += (float)b;
out33 += (float)b;
#endif // defined(HAS_BIAS)
- vstore4((float4)(out10, out11, out12, out13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
- vstore4((float4)(out20, out21, out22, out23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
- vstore4((float4)(out30, out31, out32, out33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out10, (DATA_TYPE)out11, (DATA_TYPE)out12, (DATA_TYPE)out13), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out20, (DATA_TYPE)out21, (DATA_TYPE)out22, (DATA_TYPE)out23), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out30, (DATA_TYPE)out31, (DATA_TYPE)out32, (DATA_TYPE)out33), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
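
Similarly, a plain C sketch of the 1D F(4, 3) output transform that the kernel above applies along each axis of the 6x6 tile. Only the out02/out03 rows are visible in this hunk; the out[0]/out[1] rows below follow the standard transform matrix and should be read as an assumption, not a quote from the source.

/* Reference sketch, not library code: 1D output transform of one 6-element column/row. */
static void winograd_output_f4_3_1d(const float d[6], float out[4])
{
    out[0] = d[0] + d[1] + d[2] + d[3] + d[4];
    out[1] = d[1] - d[2] + 2.0f * d[3] - 2.0f * d[4];
    out[2] = d[1] + d[2] + 4.0f * d[3] + 4.0f * d[4]; /* matches out02 in the kernel */
    out[3] = d[1] - d[2] + 8.0f * d[3] - 8.0f * d[4] + d[5]; /* matches out03 in the kernel */
}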
@@ -358,45 +390,54 @@
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] dst_size Size of the destination tensor, minus the last padding
*/
__kernel void winograd_output_transform_4x4_3x3_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(bias),
#endif // defined(HAS_BIAS)
int dst_size)
{
// Each thread stores a 4x4/4x1 or 1x4 tile
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
// Load the values across the 36 channels to compose the 6x6 or 6x1 tile
- float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
- float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
- float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
- float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
- float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
- float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
// Compute out00, out01, out02 and out03
@@ -406,40 +447,40 @@
float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
- float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
- float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
- float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
- float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
- float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
- float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
- float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
- float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
- float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
- float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
- float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
- float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
- float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
- float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
- float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
- float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
- float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
- float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
- float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
- float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
- float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
- float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
- float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
- float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
- float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
- float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
- float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
- float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
- float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
// Compute out00, out01, out02 and out03
float out00 = d01 + d21 + d41 + d11 + d31;
@@ -502,77 +543,88 @@
int x_out = get_global_id(0);
int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
#if defined(HAS_BIAS)
// Add bias
Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
- float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));
+ DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
- out00 += (float)b;
- out01 += (float)b;
- out02 += (float)b;
- out03 += (float)b;
+ out00 += (DATA_TYPE)b;
+ out01 += (DATA_TYPE)b;
+ out02 += (DATA_TYPE)b;
+ out03 += (DATA_TYPE)b;
#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- out10 += (float)b;
- out11 += (float)b;
- out12 += (float)b;
- out13 += (float)b;
+ out10 += (DATA_TYPE)b;
+ out11 += (DATA_TYPE)b;
+ out12 += (DATA_TYPE)b;
+ out13 += (DATA_TYPE)b;
- out20 += (float)b;
- out21 += (float)b;
- out22 += (float)b;
- out23 += (float)b;
+ out20 += (DATA_TYPE)b;
+ out21 += (DATA_TYPE)b;
+ out22 += (DATA_TYPE)b;
+ out23 += (DATA_TYPE)b;
- out30 += (float)b;
- out31 += (float)b;
- out32 += (float)b;
- out33 += (float)b;
+ out30 += (DATA_TYPE)b;
+ out31 += (DATA_TYPE)b;
+ out32 += (DATA_TYPE)b;
+ out33 += (DATA_TYPE)b;
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#endif // defined(HAS_BIAS)
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
- offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+#if defined(SRC_DEPTH)
+ int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+ offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
// Store the 1x4 output tile
- *((__global float *)(dst_ptr + offset.s0)) = out00;
- *((__global float *)(dst_ptr + offset.s1)) = out01;
- *((__global float *)(dst_ptr + offset.s2)) = out02;
- *((__global float *)(dst_ptr + offset.s3)) = out03;
+ *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = (DATA_TYPE)out00;
+ *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = (DATA_TYPE)out01;
+ *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = (DATA_TYPE)out02;
+ *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = (DATA_TYPE)out03;
#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
// Store the 4x1 output tile
- int offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+ int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
int mult_y = min(dst_size - offset, 1);
- *((__global float *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out00;
- *((__global float *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out01;
- *((__global float *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out02;
- *((__global float *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out03;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = (DATA_TYPE)out00;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = (DATA_TYPE)out01;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = (DATA_TYPE)out02;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = (DATA_TYPE)out03;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
// Get output address
- int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
+#if defined(SRC_DEPTH)
+ int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
- int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
+ int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
// Store the 4x4 output tile
- *((__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out00;
- *((__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out01;
- *((__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out02;
- *((__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out03;
- *((__global float *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out10;
- *((__global float *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out11;
- *((__global float *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out12;
- *((__global float *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out13;
- *((__global float *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out20;
- *((__global float *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out21;
- *((__global float *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out22;
- *((__global float *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out23;
- *((__global float *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out30;
- *((__global float *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;
- *((__global float *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;
- *((__global float *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = (DATA_TYPE)out00;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = (DATA_TYPE)out01;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = (DATA_TYPE)out02;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = (DATA_TYPE)out03;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = (DATA_TYPE)out10;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = (DATA_TYPE)out11;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = (DATA_TYPE)out12;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = (DATA_TYPE)out13;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = (DATA_TYPE)out20;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = (DATA_TYPE)out21;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = (DATA_TYPE)out22;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = (DATA_TYPE)out23;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = (DATA_TYPE)out30;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = (DATA_TYPE)out31;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = (DATA_TYPE)out32;
+ *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = (DATA_TYPE)out33;
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
}
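The NHWC stores above avoid explicit bounds checks: each per-plane byte offset is clamped to dst_size (which points at the last padding element), and a 0/1 multiplier zeroes the Y stride for any clamped lane, so out-of-range work-items write harmlessly into the padding rather than past the buffer. A minimal sketch of the same trick, with illustrative kernel name and arguments:

// Sketch (OpenCL C): clamp-to-padding store, assuming dst_size points at the last padding byte.
__kernel void clamp_store_sketch(__global uchar *dst_ptr, int dst_stride_y, int dst_stride_z,
                                 int base_offset, int dst_size, float4 values)
{
    // One byte offset per Z plane; lanes past the last plane are clamped onto dst_size.
    int4 offset = (int4)(base_offset) + (int4)(0, 1, 2, 3) * (int4)dst_stride_z;
    offset      = min(offset, (int4)dst_size);
    // 0 for clamped lanes, 1 otherwise, so clamped lanes never advance along Y either.
    int4 mult_y = min((int4)dst_size - offset, (int4)1);
    *(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0) = values.s0;
    *(__global float *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1) = values.s1;
    *(__global float *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2) = values.s2;
    *(__global float *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3) = values.s3;
}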
@@ -601,27 +653,32 @@
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_4x4_5x5_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bias)
@@ -629,27 +686,38 @@
)
{
// Each thread stores a 4x4/4x1 or 1x4 tile
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
// Compute output address
int y_in = get_global_id(1);
int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
// Load the values across the channels to compose the input tile
- float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
- float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
- float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
- float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
- float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
- float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
- float d06 = *((__global float *)(src_addr + 6 * src_stride_z));
- float d07 = *((__global float *)(src_addr + 7 * src_stride_z));
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
// Compute out00, out01, out02 and out03
@@ -662,91 +730,93 @@
// Add bias
Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
- float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
- out00 += (float)b;
- out01 += (float)b;
- out02 += (float)b;
- out03 += (float)b;
+ out00 += (DATA_TYPE)b;
+ out01 += (DATA_TYPE)b;
+ out02 += (DATA_TYPE)b;
+ out03 += (DATA_TYPE)b;
#endif // defined(HAS_BIAS)
// Store the output tile
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
- *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
- *((__global float *)(dst_addr + 2 * dst_stride_y)) = out02;
- *((__global float *)(dst_addr + 3 * dst_stride_y)) = out03;
+ *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;
+ *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;
+ *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;
+ *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr));
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr));
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- float d10 = *((__global float *)(src_addr + 8 * src_stride_z));
- float d11 = *((__global float *)(src_addr + 9 * src_stride_z));
- float d12 = *((__global float *)(src_addr + 10 * src_stride_z));
- float d13 = *((__global float *)(src_addr + 11 * src_stride_z));
- float d14 = *((__global float *)(src_addr + 12 * src_stride_z));
- float d15 = *((__global float *)(src_addr + 13 * src_stride_z));
- float d16 = *((__global float *)(src_addr + 14 * src_stride_z));
- float d17 = *((__global float *)(src_addr + 15 * src_stride_z));
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
- float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
- float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
- float d22 = *((__global float *)(src_addr + 18 * src_stride_z));
- float d23 = *((__global float *)(src_addr + 19 * src_stride_z));
- float d24 = *((__global float *)(src_addr + 20 * src_stride_z));
- float d25 = *((__global float *)(src_addr + 21 * src_stride_z));
- float d26 = *((__global float *)(src_addr + 22 * src_stride_z));
- float d27 = *((__global float *)(src_addr + 23 * src_stride_z));
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
- float d30 = *((__global float *)(src_addr + 24 * src_stride_z));
- float d31 = *((__global float *)(src_addr + 25 * src_stride_z));
- float d32 = *((__global float *)(src_addr + 26 * src_stride_z));
- float d33 = *((__global float *)(src_addr + 27 * src_stride_z));
- float d34 = *((__global float *)(src_addr + 28 * src_stride_z));
- float d35 = *((__global float *)(src_addr + 29 * src_stride_z));
- float d36 = *((__global float *)(src_addr + 30 * src_stride_z));
- float d37 = *((__global float *)(src_addr + 31 * src_stride_z));
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+ DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
- float d40 = *((__global float *)(src_addr + 32 * src_stride_z));
- float d41 = *((__global float *)(src_addr + 33 * src_stride_z));
- float d42 = *((__global float *)(src_addr + 34 * src_stride_z));
- float d43 = *((__global float *)(src_addr + 35 * src_stride_z));
- float d44 = *((__global float *)(src_addr + 36 * src_stride_z));
- float d45 = *((__global float *)(src_addr + 37 * src_stride_z));
- float d46 = *((__global float *)(src_addr + 38 * src_stride_z));
- float d47 = *((__global float *)(src_addr + 39 * src_stride_z));
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+ DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+ DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
- float d50 = *((__global float *)(src_addr + 40 * src_stride_z));
- float d51 = *((__global float *)(src_addr + 41 * src_stride_z));
- float d52 = *((__global float *)(src_addr + 42 * src_stride_z));
- float d53 = *((__global float *)(src_addr + 43 * src_stride_z));
- float d54 = *((__global float *)(src_addr + 44 * src_stride_z));
- float d55 = *((__global float *)(src_addr + 45 * src_stride_z));
- float d56 = *((__global float *)(src_addr + 46 * src_stride_z));
- float d57 = *((__global float *)(src_addr + 47 * src_stride_z));
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+ DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+ DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
- float d60 = *((__global float *)(src_addr + 48 * src_stride_z));
- float d61 = *((__global float *)(src_addr + 49 * src_stride_z));
- float d62 = *((__global float *)(src_addr + 50 * src_stride_z));
- float d63 = *((__global float *)(src_addr + 51 * src_stride_z));
- float d64 = *((__global float *)(src_addr + 52 * src_stride_z));
- float d65 = *((__global float *)(src_addr + 53 * src_stride_z));
- float d66 = *((__global float *)(src_addr + 54 * src_stride_z));
- float d67 = *((__global float *)(src_addr + 55 * src_stride_z));
+ DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+ DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+ DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+ DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+ DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+ DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+ DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+ DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
- float d70 = *((__global float *)(src_addr + 56 * src_stride_z));
- float d71 = *((__global float *)(src_addr + 57 * src_stride_z));
- float d72 = *((__global float *)(src_addr + 58 * src_stride_z));
- float d73 = *((__global float *)(src_addr + 59 * src_stride_z));
- float d74 = *((__global float *)(src_addr + 60 * src_stride_z));
- float d75 = *((__global float *)(src_addr + 61 * src_stride_z));
- float d76 = *((__global float *)(src_addr + 62 * src_stride_z));
- float d77 = *((__global float *)(src_addr + 63 * src_stride_z));
+ DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+ DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+ DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+ DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+ DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+ DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+ DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+ DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
// Compute the 8x4 intermediate tensor
- float4 comm_fact0, comm_fact1, comm_fact2;
- float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+ VEC_DATA_TYPE(float, 4)
+ comm_fact0, comm_fact1, comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
@@ -762,33 +832,37 @@
comm_fact1 = tmp_col3 + tmp_col4;
comm_fact2 = tmp_col5 + tmp_col6;
- float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
- float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
+ VEC_DATA_TYPE(float, 4)
+ out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
comm_fact0 = tmp_col1 - tmp_col2;
comm_fact1 = tmp_col3 - tmp_col4;
comm_fact2 = tmp_col5 - tmp_col6;
- float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
- float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
+ VEC_DATA_TYPE(float, 4)
+ out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
#if defined(HAS_BIAS)
// Add bias
Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
- float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
- out_col0 += (float4)b;
- out_col1 += (float4)b;
- out_col2 += (float4)b;
- out_col3 += (float4)b;
+ out_col0 += (VEC_DATA_TYPE(float, 4))b;
+ out_col1 += (VEC_DATA_TYPE(float, 4))b;
+ out_col2 += (VEC_DATA_TYPE(float, 4))b;
+ out_col3 += (VEC_DATA_TYPE(float, 4))b;
#endif // defined(HAS_BIAS)
// Store the output tile
- vstore4((float4)(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
- vstore4((float4)(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
- vstore4((float4)(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
- vstore4((float4)(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s0, (DATA_TYPE)out_col1.s0, (DATA_TYPE)out_col2.s0, (DATA_TYPE)out_col3.s0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s1, (DATA_TYPE)out_col1.s1, (DATA_TYPE)out_col2.s1, (DATA_TYPE)out_col3.s1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s2, (DATA_TYPE)out_col1.s2, (DATA_TYPE)out_col2.s2, (DATA_TYPE)out_col3.s2), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+ vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s3, (DATA_TYPE)out_col1.s3, (DATA_TYPE)out_col2.s3, (DATA_TYPE)out_col3.s3), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
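The kernel above is now generic over the element type: the host specialises it at build time (e.g. -DDATA_TYPE=half together with -DSRC_DEPTH, -DNUM_TILES_X, -DOUTPUT_TILE_W and -DOUTPUT_TILE_H), VEC_DATA_TYPE(DATA_TYPE, 4) from the library's helpers expands to half4 or float4 accordingly, and the precision-sensitive intermediates stay in float until the final store narrows them back to DATA_TYPE. A reduced sketch of that compute-in-float, store-as-DATA_TYPE pattern, with an illustrative kernel name:

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable // needed when the kernel is built with -DDATA_TYPE=half
#endif

// Sketch (OpenCL C): DATA_TYPE is supplied at build time, the arithmetic is kept in float.
__kernel void cast_accumulate_sketch(__global const DATA_TYPE *src, __global DATA_TYPE *dst, float bias)
{
    const int i = get_global_id(0);
    float acc = (float)src[i] + bias; // accumulate in float regardless of the storage type
    dst[i]    = (DATA_TYPE)acc;       // narrow back to DATA_TYPE only when writing the result out
}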
@@ -799,51 +873,63 @@
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_4x4_5x5_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(bias),
#endif // defined(HAS_BIAS)
int dst_size)
{
// Each thread stores a 4x4/4x1 or 1x4 tile
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
int y_in = get_global_id(1);
int x_out = get_global_id(0);
int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
// Load the values across the channels to compose the input tile
- float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
- float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
- float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
- float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
- float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
- float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
- float d06 = *((__global float *)(src_addr + 6 * src_stride_z));
- float d07 = *((__global float *)(src_addr + 7 * src_stride_z));
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
// Compute out00, out01, out02 and out03
@@ -856,7 +942,7 @@
// Add bias
Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
- float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
out00 += (float)b;
out01 += (float)b;
@@ -867,91 +953,97 @@
// Store the output tile
#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
// Get output address
- int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
- offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+#if defined(SRC_DEPTH)
+ int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+ offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
- *(__global float *)(dst_ptr + offset.s0) = out00;
- *(__global float *)(dst_ptr + offset.s1) = out01;
- *(__global float *)(dst_ptr + offset.s2) = out02;
- *(__global float *)(dst_ptr + offset.s3) = out03;
+ *(__global DATA_TYPE *)(dst_ptr + offset.s0) = (DATA_TYPE)out00;
+ *(__global DATA_TYPE *)(dst_ptr + offset.s1) = (DATA_TYPE)out01;
+ *(__global DATA_TYPE *)(dst_ptr + offset.s2) = (DATA_TYPE)out02;
+ *(__global DATA_TYPE *)(dst_ptr + offset.s3) = (DATA_TYPE)out03;
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
// Get output address
- int offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+ int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
- *(__global float *)(dst_ptr + 0 * dst_stride_y + offset) = out00;
- *(__global float *)(dst_ptr + 1 * dst_stride_y + offset) = out01;
- *(__global float *)(dst_ptr + 2 * dst_stride_y + offset) = out02;
- *(__global float *)(dst_ptr + 3 * dst_stride_y + offset) = out03;
+ *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = (DATA_TYPE)out00;
+ *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = (DATA_TYPE)out01;
+ *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = (DATA_TYPE)out02;
+ *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = (DATA_TYPE)out03;
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- float d10 = *((__global float *)(src_addr + 8 * src_stride_z));
- float d11 = *((__global float *)(src_addr + 9 * src_stride_z));
- float d12 = *((__global float *)(src_addr + 10 * src_stride_z));
- float d13 = *((__global float *)(src_addr + 11 * src_stride_z));
- float d14 = *((__global float *)(src_addr + 12 * src_stride_z));
- float d15 = *((__global float *)(src_addr + 13 * src_stride_z));
- float d16 = *((__global float *)(src_addr + 14 * src_stride_z));
- float d17 = *((__global float *)(src_addr + 15 * src_stride_z));
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
- float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
- float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
- float d22 = *((__global float *)(src_addr + 18 * src_stride_z));
- float d23 = *((__global float *)(src_addr + 19 * src_stride_z));
- float d24 = *((__global float *)(src_addr + 20 * src_stride_z));
- float d25 = *((__global float *)(src_addr + 21 * src_stride_z));
- float d26 = *((__global float *)(src_addr + 22 * src_stride_z));
- float d27 = *((__global float *)(src_addr + 23 * src_stride_z));
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
- float d30 = *((__global float *)(src_addr + 24 * src_stride_z));
- float d31 = *((__global float *)(src_addr + 25 * src_stride_z));
- float d32 = *((__global float *)(src_addr + 26 * src_stride_z));
- float d33 = *((__global float *)(src_addr + 27 * src_stride_z));
- float d34 = *((__global float *)(src_addr + 28 * src_stride_z));
- float d35 = *((__global float *)(src_addr + 29 * src_stride_z));
- float d36 = *((__global float *)(src_addr + 30 * src_stride_z));
- float d37 = *((__global float *)(src_addr + 31 * src_stride_z));
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+ DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
- float d40 = *((__global float *)(src_addr + 32 * src_stride_z));
- float d41 = *((__global float *)(src_addr + 33 * src_stride_z));
- float d42 = *((__global float *)(src_addr + 34 * src_stride_z));
- float d43 = *((__global float *)(src_addr + 35 * src_stride_z));
- float d44 = *((__global float *)(src_addr + 36 * src_stride_z));
- float d45 = *((__global float *)(src_addr + 37 * src_stride_z));
- float d46 = *((__global float *)(src_addr + 38 * src_stride_z));
- float d47 = *((__global float *)(src_addr + 39 * src_stride_z));
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+ DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+ DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
- float d50 = *((__global float *)(src_addr + 40 * src_stride_z));
- float d51 = *((__global float *)(src_addr + 41 * src_stride_z));
- float d52 = *((__global float *)(src_addr + 42 * src_stride_z));
- float d53 = *((__global float *)(src_addr + 43 * src_stride_z));
- float d54 = *((__global float *)(src_addr + 44 * src_stride_z));
- float d55 = *((__global float *)(src_addr + 45 * src_stride_z));
- float d56 = *((__global float *)(src_addr + 46 * src_stride_z));
- float d57 = *((__global float *)(src_addr + 47 * src_stride_z));
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+ DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+ DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
- float d60 = *((__global float *)(src_addr + 48 * src_stride_z));
- float d61 = *((__global float *)(src_addr + 49 * src_stride_z));
- float d62 = *((__global float *)(src_addr + 50 * src_stride_z));
- float d63 = *((__global float *)(src_addr + 51 * src_stride_z));
- float d64 = *((__global float *)(src_addr + 52 * src_stride_z));
- float d65 = *((__global float *)(src_addr + 53 * src_stride_z));
- float d66 = *((__global float *)(src_addr + 54 * src_stride_z));
- float d67 = *((__global float *)(src_addr + 55 * src_stride_z));
+ DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+ DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+ DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+ DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+ DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+ DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+ DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+ DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
- float d70 = *((__global float *)(src_addr + 56 * src_stride_z));
- float d71 = *((__global float *)(src_addr + 57 * src_stride_z));
- float d72 = *((__global float *)(src_addr + 58 * src_stride_z));
- float d73 = *((__global float *)(src_addr + 59 * src_stride_z));
- float d74 = *((__global float *)(src_addr + 60 * src_stride_z));
- float d75 = *((__global float *)(src_addr + 61 * src_stride_z));
- float d76 = *((__global float *)(src_addr + 62 * src_stride_z));
- float d77 = *((__global float *)(src_addr + 63 * src_stride_z));
+ DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+ DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+ DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+ DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+ DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+ DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+ DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+ DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
// Compute the 8x4 intermediate tensor
- float4 comm_fact0, comm_fact1, comm_fact2;
- float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+ VEC_DATA_TYPE(float, 4)
+ comm_fact0, comm_fact1, comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
@@ -967,49 +1059,57 @@
comm_fact1 = tmp_col3 + tmp_col4;
comm_fact2 = tmp_col5 + tmp_col6;
- float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
- float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
+ VEC_DATA_TYPE(float, 4)
+ out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
comm_fact0 = tmp_col1 - tmp_col2;
comm_fact1 = tmp_col3 - tmp_col4;
comm_fact2 = tmp_col5 - tmp_col6;
- float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
- float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
+ VEC_DATA_TYPE(float, 4)
+ out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
+ VEC_DATA_TYPE(float, 4)
+ out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
#if defined(HAS_BIAS)
// Add bias
Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
- float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));
+ DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
- out_col0 += (float4)b;
- out_col1 += (float4)b;
- out_col2 += (float4)b;
- out_col3 += (float4)b;
+ out_col0 += (VEC_DATA_TYPE(float, 4))b;
+ out_col1 += (VEC_DATA_TYPE(float, 4))b;
+ out_col2 += (VEC_DATA_TYPE(float, 4))b;
+ out_col3 += (VEC_DATA_TYPE(float, 4))b;
#endif // defined(HAS_BIAS)
// Get output address
- int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
+#if defined(SRC_DEPTH)
+ int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
// Store the output tile
- *(__global float *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0.s0;
- *(__global float *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1.s0;
- *(__global float *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2.s0;
- *(__global float *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3.s0;
- *(__global float *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0.s1;
- *(__global float *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1.s1;
- *(__global float *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2.s1;
- *(__global float *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3.s1;
- *(__global float *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0.s2;
- *(__global float *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1.s2;
- *(__global float *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2.s2;
- *(__global float *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3.s2;
- *(__global float *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0.s3;
- *(__global float *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1.s3;
- *(__global float *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2.s3;
- *(__global float *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3.s3;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col0.s0;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col1.s0;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col2.s0;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col3.s0;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col0.s1;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col1.s1;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col2.s1;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col3.s1;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col0.s2;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col1.s2;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col2.s2;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col3.s2;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col0.s3;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col1.s3;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col2.s3;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col3.s3;
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
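Every transform in this hunk gains the same optional batching scheme: when -DSRC_DEPTH is defined the kernels are launched with get_global_id(2) spanning depth times batches, the batch index is recovered as get_global_id(2) / SRC_DEPTH, and batch * dst_stride_w is folded into the destination address; without -DSRC_DEPTH they remain purely 3D. A compact sketch of that addressing, with simplified names rather than the kernels' exact tile indexing:

// Sketch (OpenCL C): recovering the batch index from a collapsed (depth * batch) dimension 2.
__kernel void batched_addressing_sketch(__global uchar *dst_ptr, int dst_offset_first_element_in_bytes,
                                        int dst_stride_z, int dst_stride_w)
{
#if defined(SRC_DEPTH)
    const int z     = get_global_id(2) % SRC_DEPTH; // plane inside the current batch
    const int batch = get_global_id(2) / SRC_DEPTH; // which batch this work-item belongs to
    __global uchar *out = dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_z + batch * dst_stride_w;
#else  /* defined(SRC_DEPTH) */
    const int z = get_global_id(2);                 // no batching: dimension 2 is plain depth
    __global uchar *out = dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_z;
#endif /* defined(SRC_DEPTH) */
    (void)out; // address computation only; the real kernels store the transformed output tile here
}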
@@ -1020,27 +1120,32 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_2x1_3x1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bias)
@@ -1054,6 +1159,8 @@
src_step_y,
src_stride_z,
src_step_z,
+ src_stride_w,
+ src_step_w,
src_offset_first_element_in_bytes,
dst_ptr,
dst_stride_x,
@@ -1062,6 +1169,8 @@
dst_step_y,
dst_stride_z,
dst_step_z,
+ dst_stride_w,
+ dst_step_w,
dst_offset_first_element_in_bytes
#if defined(HAS_BIAS)
,
@@ -1079,27 +1188,32 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_4x1_3x1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bias)
@@ -1113,6 +1227,8 @@
src_step_y,
src_stride_z,
src_step_z,
+ src_stride_w,
+ src_step_w,
src_offset_first_element_in_bytes,
dst_ptr,
dst_stride_x,
@@ -1121,6 +1237,8 @@
dst_step_y,
dst_stride_z,
dst_step_z,
+ dst_stride_w,
+ dst_step_w,
dst_offset_first_element_in_bytes
#if defined(HAS_BIAS)
,
@@ -1138,27 +1256,32 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_4x1_5x1_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bias)
@@ -1172,6 +1295,8 @@
src_step_y,
src_stride_z,
src_step_z,
+ src_stride_w,
+ src_step_w,
src_offset_first_element_in_bytes,
dst_ptr,
dst_stride_x,
@@ -1180,6 +1305,8 @@
dst_step_y,
dst_stride_z,
dst_step_z,
+ dst_stride_w,
+ dst_step_w,
dst_offset_first_element_in_bytes
#if defined(HAS_BIAS)
,
@@ -1197,27 +1324,32 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_4x1_3x1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(bias),
#endif // defined(HAS_BIAS)
@@ -1230,6 +1362,8 @@
src_step_y,
src_stride_z,
src_step_z,
+ src_stride_w,
+ src_step_w,
src_offset_first_element_in_bytes,
dst_ptr,
dst_stride_x,
@@ -1238,6 +1372,8 @@
dst_step_y,
dst_stride_z,
dst_step_z,
+ dst_stride_w,
+ dst_step_w,
dst_offset_first_element_in_bytes,
#if defined(HAS_BIAS)
bias_ptr,
@@ -1254,27 +1390,32 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_4x1_5x1_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(bias),
#endif // defined(HAS_BIAS)
@@ -1287,6 +1428,8 @@
src_step_y,
src_stride_z,
src_step_z,
+ src_stride_w,
+ src_step_w,
src_offset_first_element_in_bytes,
dst_ptr,
dst_stride_x,
@@ -1295,6 +1438,8 @@
dst_step_y,
dst_stride_z,
dst_step_z,
+ dst_stride_w,
+ dst_step_w,
dst_offset_first_element_in_bytes,
#if defined(HAS_BIAS)
bias_ptr,
@@ -1313,27 +1458,32 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_1x2_1x3_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bias)
@@ -1347,6 +1497,8 @@
src_step_y,
src_stride_z,
src_step_z,
+ src_stride_w,
+ src_step_w,
src_offset_first_element_in_bytes,
dst_ptr,
dst_stride_x,
@@ -1355,6 +1507,8 @@
dst_step_y,
dst_stride_z,
dst_step_z,
+ dst_stride_w,
+ dst_step_w,
dst_offset_first_element_in_bytes
#if defined(HAS_BIAS)
,
@@ -1372,27 +1526,32 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_1x4_1x3_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bias)
@@ -1406,6 +1565,8 @@
src_step_y,
src_stride_z,
src_step_z,
+ src_stride_w,
+ src_step_w,
src_offset_first_element_in_bytes,
dst_ptr,
dst_stride_x,
@@ -1414,6 +1575,8 @@
dst_step_y,
dst_stride_z,
dst_step_z,
+ dst_stride_w,
+ dst_step_w,
dst_offset_first_element_in_bytes
#if defined(HAS_BIAS)
,
@@ -1431,27 +1594,32 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_1x4_1x5_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst)
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bias)
@@ -1465,6 +1633,8 @@
src_step_y,
src_stride_z,
src_step_z,
+ src_stride_w,
+ src_step_w,
src_offset_first_element_in_bytes,
dst_ptr,
dst_stride_x,
@@ -1473,6 +1643,8 @@
dst_step_y,
dst_stride_z,
dst_step_z,
+ dst_stride_w,
+ dst_step_w,
dst_offset_first_element_in_bytes
#if defined(HAS_BIAS)
,
@@ -1490,27 +1662,32 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_1x4_1x3_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(bias),
#endif // defined(HAS_BIAS)
@@ -1523,6 +1700,8 @@
src_step_y,
src_stride_z,
src_step_z,
+ src_stride_w,
+ src_step_w,
src_offset_first_element_in_bytes,
dst_ptr,
dst_stride_x,
@@ -1531,6 +1710,8 @@
dst_step_y,
dst_stride_z,
dst_step_z,
+ dst_stride_w,
+ dst_step_w,
dst_offset_first_element_in_bytes,
#if defined(HAS_BIAS)
bias_ptr,
@@ -1547,27 +1728,32 @@
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void winograd_output_transform_1x4_1x5_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
#if defined(HAS_BIAS)
VECTOR_DECLARATION(bias),
#endif // defined(HAS_BIAS)
@@ -1580,6 +1766,8 @@
src_step_y,
src_stride_z,
src_step_z,
+ src_stride_w,
+ src_step_w,
src_offset_first_element_in_bytes,
dst_ptr,
dst_stride_x,
@@ -1588,6 +1776,8 @@
dst_step_y,
dst_stride_z,
dst_step_z,
+ dst_stride_w,
+ dst_step_w,
dst_offset_first_element_in_bytes,
#if defined(HAS_BIAS)
bias_ptr,
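For reference, the @note lines above enumerate the compile-time defines each winograd output-transform kernel expects. A minimal host-side sketch of assembling them is shown below; it is illustrative only, not the library's actual configure code, and the helper name is hypothetical.

#include <set>
#include <string>

// Hypothetical helper: collects the defines documented in the @note lines above.
std::set<std::string> winograd_output_transform_build_opts(const std::string &data_type, // "float" or "half"
                                                           int output_tile_w, int output_tile_h,
                                                           bool horizontal_transform)
{
    std::set<std::string> opts;
    opts.emplace("-DDATA_TYPE=" + data_type);
    opts.emplace("-DOUTPUT_TILE_W=" + std::to_string(output_tile_w));
    opts.emplace("-DOUTPUT_TILE_H=" + std::to_string(output_tile_h));
    // The 4x1/1x4 variants expect exactly one of these two defines.
    opts.emplace(horizontal_transform ? "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"
                                      : "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
    return opts;
}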
diff --git a/src/core/CL/cl_kernels/yolo_layer.cl b/src/core/CL/cl_kernels/yolo_layer.cl
new file mode 100644
index 0000000..2240d7c
--- /dev/null
+++ b/src/core/CL/cl_kernels/yolo_layer.cl
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(ACT) && defined(NUM_CLASSES) && defined(VEC_SIZE)
+
+#if VEC_SIZE != 1
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define SELECT_TYPE VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+
+#include "activation_helpers.h"
+
+/** This performs a YOLO partial activation function for NCHW data layout
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note The number of classes should be given as a preprocessor argument using -DNUM_CLASSES=num. e.g. -DNUM_CLASSES=80
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void yolo_layer_nchw(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ const int box_ch_id = get_global_id(2) % (NUM_CLASSES + 5);
+ const bool activate = box_ch_id != 2 && box_ch_id != 3;
+
+ if(activate)
+ {
+ // Load data
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+ data = ACTIVATION_OP(ACT, data); // select(1.0f, ACTIVATION_OP(ACT, data), (SELECT_TYPE)activate);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)output.ptr);
+ }
+#ifndef IN_PLACE
+ else
+ {
+ // Load data
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)output.ptr);
+ }
+#endif // IN_PLACE
+}
+
+#else // VEC_SIZE != 1
+
+#define TYPE DATA_TYPE
+#define SELECT_TYPE SELECT_DATA_TYPE
+
+#include "activation_helpers.h"
+
+/** This performs a YOLO partial activation function for NHWC data layout
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=1
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note The number of classes should be given as a preprocessor argument using -DNUM_CLASSES=num. e.g. -DNUM_CLASSES=80
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void yolo_layer_nhwc(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ const int box_ch_id = get_global_id(0) % (NUM_CLASSES + 5);
+ const bool activate = box_ch_id != 2 && box_ch_id != 3;
+
+ if(activate)
+ {
+ // Load data
+ DATA_TYPE data = *((__global DATA_TYPE *)input.ptr);
+ data = select(data, ACTIVATION_OP(ACT, data), (SELECT_TYPE)activate);
+
+ // Store result
+ *((__global DATA_TYPE *)output.ptr) = data;
+ }
+#ifndef IN_PLACE
+ else
+ {
+ // Load data
+ DATA_TYPE data = *((__global DATA_TYPE *)input.ptr);
+
+ // Store result
+ *((__global DATA_TYPE *)output.ptr) = data;
+ }
+#endif // IN_PLACE
+}
+
+#endif // VEC_SIZE != 1
+#endif // defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(ACT) && defined(NUM_CLASSES) && defined(VEC_SIZE)
\ No newline at end of file
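Both yolo_layer kernels above share the same channel-selection rule: every box spans NUM_CLASSES + 5 channels, and only channels 2 and 3 (the raw width/height terms) bypass the activation. A minimal scalar sketch of that rule follows; ACT=LOGISTIC is assumed here for illustration, whereas the real kernels take the activation via -DACT.

#include <cmath>

// Channels 2 and 3 of each box are passed through unmodified.
bool yolo_channel_is_activated(int channel, int num_classes)
{
    const int box_ch_id = channel % (num_classes + 5);
    return box_ch_id != 2 && box_ch_id != 3;
}

float yolo_apply(float value, int channel, int num_classes)
{
    // Assumption: logistic activation on the activated channels.
    return yolo_channel_is_activated(channel, num_classes)
               ? 1.0f / (1.0f + std::exp(-value))
               : value;
}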
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index a15e99b..73a4d7d 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -133,6 +133,7 @@
std::set<std::string> build_opts;
build_opts.emplace(("-DACT=" + lower_string(string_from_activation_func(act_info.activation()))));
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+ build_opts.emplace(("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt)));
build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
if(is_data_type_quantized(dt))
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
index 2372d45..10d7fd4 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -31,7 +31,7 @@
namespace
{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+constexpr unsigned int num_elems_processed_per_iteration = 8;
Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
{
@@ -140,6 +140,7 @@
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
{
build_opts.emplace("-DOFFSET_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
@@ -155,11 +156,22 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += (policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
+ _config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
}
Status CLArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
@@ -176,8 +188,9 @@
const TensorShape &in_shape2 = _input2->info()->tensor_shape();
const TensorShape &out_shape = _output->info()->tensor_shape();
- bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ bool can_collapse = true;
+ const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
+ if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
{
can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
@@ -204,7 +217,7 @@
add_3D_tensor_argument(idx, _input2, slice_input2);
add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
+ enqueue(queue, *this, slice, lws_hint());
collapsed.slide_window_slice_3D(slice_input1);
collapsed.slide_window_slice_3D(slice_input2);
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
index 299ac55..95d2011 100644
--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -36,45 +36,82 @@
#include <set>
#include <string>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
{
ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ const bool is_qasymm = is_data_type_quantized_asymmetric(input1.data_type()) || is_data_type_quantized_asymmetric(input2.data_type());
+ if(is_qasymm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+ }
+
+ const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured output
- if((output != nullptr) && (output->total_size() != 0))
+ if(output.total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
"Output can only be U8 if both inputs are U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+ "Wrong shape for output");
+ if(is_qasymm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
+ }
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
{
- constexpr unsigned int num_elems_processed_per_iteration = 16;
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
- Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(output, out_shape);
- bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+ if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
+ {
+ set_format_if_unknown(output, Format::S16);
+ }
+ else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
+ {
+ set_format_if_unknown(output, Format::F16);
+ }
+ else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
+ {
+ set_format_if_unknown(output, Format::F32);
+ }
+ }
- ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
- input2->valid_region());
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(input1);
+ Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+
+ AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access)
+ || update_window_and_padding(win_input2, input2_access)
+ || update_window_and_padding(win, output_access);
output_access.set_valid_region(win, valid_region);
@@ -91,22 +128,11 @@
void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
-
- if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output->info(), Format::S16);
- }
- else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
- }
- }
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
_input1 = input1;
_input2 = input2;
@@ -114,26 +140,39 @@
bool has_float_out = is_data_type_float(output->info()->data_type());
+ // Setup kernel
+ std::string kernel_name = "arithmetic_sub";
+
// Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
- build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ CLBuildOptions build_opts;
+ build_opts.add_option_if_else(policy == ConvertPolicy::WRAP || has_float_out, "-DWRAP", "-DSATURATE");
+ build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
+ {
+ build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
+ build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().offset));
+ build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().scale));
+ build_opts.add_option("-DSCALE_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().scale));
+ build_opts.add_option("-DSCALE_OUT=" + support::cpp11::to_string(output->info()->quantization_info().scale));
+ kernel_name += "_quantized";
+ }
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Configure kernel window
- auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
}
Status CLArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
return Status{};
}
@@ -143,16 +182,51 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
+ const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+ const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+ const TensorShape &out_shape = _output->info()->tensor_shape();
+
+ // Collapse only if the broadcast dimensions are less than 2, or in case of no broadcasting
+ bool can_collapse = true;
+ if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ {
+ can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ {
+ can_collapse = (in_shape1[d] == in_shape2[d]);
+ }
+ }
+
+ bool has_collapsed = false;
+ Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+ const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+ Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input1, slice);
- add_3D_tensor_argument(idx, _input2, slice);
+
+ add_3D_tensor_argument(idx, _input1, slice_input1);
+ add_3D_tensor_argument(idx, _input2, slice_input2);
add_3D_tensor_argument(idx, _output, slice);
+
enqueue(queue, *this, slice);
+
+ collapsed.slide_window_slice_3D(slice_input1);
+ collapsed.slide_window_slice_3D(slice_input2);
}
while(collapsed.slide_window_slice_3D(slice));
}
+
+BorderSize CLArithmeticSubtractionKernel::border_size() const
+{
+ const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
+} // namespace arm_compute
\ No newline at end of file
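The broadcast handling added to CLArithmeticSubtractionKernel hinges on two decisions: whether Z and above may be collapsed, and how wide a right border the X-broadcast case needs. The standalone sketch below mirrors both; plain vectors stand in for TensorShape and the total_size() == 1 shortcut is omitted.

#include <algorithm>
#include <cstddef>
#include <vector>

// TensorShape-style lookup: missing dimensions count as 1.
static size_t dim_or_1(const std::vector<size_t> &shape, size_t d)
{
    return d < shape.size() ? shape[d] : 1;
}

// Collapse Z and above only when neither input broadcasts in those dimensions.
bool can_collapse_above_z(const std::vector<size_t> &shape1,
                          const std::vector<size_t> &shape2,
                          const std::vector<size_t> &out_shape)
{
    const size_t dim_z = 2;
    if(std::min(shape1.size(), shape2.size()) <= dim_z)
        return false;
    bool collapse = true;
    for(size_t d = dim_z; collapse && d < out_shape.size(); ++d)
    {
        collapse = dim_or_1(shape1, d) == dim_or_1(shape2, d);
    }
    return collapse;
}

// Right border required when one input is replicated along X (mirrors border_size()).
unsigned int broadcast_right_border(unsigned int out_dim0, unsigned int in1_dim0, unsigned int in2_dim0,
                                    unsigned int num_elems_per_iteration)
{
    const unsigned int replicate = out_dim0 - std::min(in1_dim0, in2_dim0);
    return std::min(num_elems_per_iteration - 1U, replicate);
}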
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index d4a7207..07bcb75 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -159,6 +159,7 @@
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
@@ -192,8 +193,6 @@
ICLKernel::configure_internal(win_config.second);
_config_id = "batch_normalization_layer_";
- _config_id += string_from_data_layout(input->info()->data_layout());
- _config_id += "_";
_config_id += string_from_data_type(input->info()->data_type());
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(0));
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
new file mode 100644
index 0000000..58a8d10
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute::misc::shape_calculator;
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape_y <= 0);
+
+ const DataLayout data_layout = input->data_layout();
+ const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel()
+ : _input(nullptr), _block_shape(nullptr), _output(nullptr)
+{
+}
+
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
+
+ _input = input;
+ _block_shape = block_shape;
+ _output = output;
+
+ const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
+ build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ ICLKernel::configure_internal(win);
+}
+
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ TensorShape output_shape = compute_batch_to_space_shape(input->info(), block_shape_x, block_shape_y);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info()));
+
+ _input = input;
+ _output = output;
+
+ const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
+ build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
+ build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
+ build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ ICLKernel::configure_internal(win);
+}
+
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
+ return Status{};
+}
+
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output));
+ return Status{};
+}
+
+void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_4D();
+
+ Window vector_slice = window.first_slice_window_1D();
+ vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_out.set(3, Window::Dimension(0, 0, 0));
+
+ int batch_id = 0;
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_argument(idx, batch_id);
+ if(_block_shape != nullptr)
+ {
+ add_1D_tensor_argument(idx, _block_shape, vector_slice);
+ }
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_in);
+
+ ++batch_id;
+ }
+ while(window.slide_window_slice_3D(slice_in));
+}
+} // namespace arm_compute
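The static validation above pins down the shape relation of batch-to-space: width scales by block_shape_x, height by block_shape_y, channels stay put, and the batch count divides by the block area. A compact sketch under those assumptions (NCHW-style ordering used for brevity):

#include <array>
#include <cassert>

// shape = {W, H, C, N}
std::array<int, 4> batch_to_space_shape(std::array<int, 4> in, int block_x, int block_y)
{
    assert(block_x > 0 && block_y > 0);
    assert(in[3] % (block_x * block_y) == 0); // batches must divide by the block area
    return { in[0] * block_x,               // width grows by block_shape_x
             in[1] * block_y,               // height grows by block_shape_y
             in[2],                         // channels unchanged
             in[3] / (block_x * block_y) }; // batches shrink by the block area
}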
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
new file mode 100644
index 0000000..bff28e3
--- /dev/null
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(boxes, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(deltas, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON(deltas->tensor_shape()[1] != boxes->tensor_shape()[1]);
+ ARM_COMPUTE_RETURN_ERROR_ON(deltas->tensor_shape()[0] % 4 != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(deltas->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
+
+ if(pred_boxes->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas);
+ ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0);
+ return Status{};
+}
+} // namespace
+
+CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel()
+ : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
+{
+}
+
+void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
+ auto_init_if_empty(*pred_boxes->info(), *deltas->info());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
+
+ // Set instance variables
+ _boxes = boxes;
+ _pred_boxes = pred_boxes;
+ _deltas = deltas;
+
+ // Get the rescaled image height and width
+ const int img_h = floor(info.img_height() / info.scale() + 0.5f);
+ const int img_w = floor(info.img_width() / info.scale() + 0.5f);
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(deltas->info()->data_type()));
+ build_opts.add_option("-DWEIGHT_X=" + float_to_string_with_full_precision(info.weights()[0]));
+ build_opts.add_option("-DWEIGHT_Y=" + float_to_string_with_full_precision(info.weights()[1]));
+ build_opts.add_option("-DWEIGHT_W=" + float_to_string_with_full_precision(info.weights()[2]));
+ build_opts.add_option("-DWEIGHT_H=" + float_to_string_with_full_precision(info.weights()[3]));
+ build_opts.add_option("-DBBOX_XFORM_CLIP=" + float_to_string_with_full_precision(info.bbox_xform_clip()));
+ build_opts.add_option("-DIMG_WIDTH=" + support::cpp11::to_string(img_w));
+ build_opts.add_option("-DIMG_HEIGHT=" + support::cpp11::to_string(img_h));
+ build_opts.add_option("-DBOX_FIELDS=" + support::cpp11::to_string(4));
+ build_opts.add_option("-DSCALE_BEFORE=" + float_to_string_with_full_precision(info.scale()));
+ build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale()));
+ build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bounding_box_transform", build_opts.options()));
+
+ // Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor
+ const unsigned int num_elems_processed_per_iteration = 4;
+ Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+}
+
+Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
+ return Status{};
+}
+
+void CLBoundingBoxTransformKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ // Set arguments
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _boxes, slice);
+ add_2D_tensor_argument(idx, _pred_boxes, slice);
+ add_2D_tensor_argument(idx, _deltas, slice);
+
+ // Note that we don't need to loop over the slices, since all the tensors involved are 2D
+ enqueue(queue, *this, slice);
+}
+} // namespace arm_compute
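The configure() above bakes the rescaled image size into the kernel as the IMG_WIDTH/IMG_HEIGHT build options. A minimal standalone sketch of just that rounding step (plain C++, standard library only; the 600x800 image and 1.6 scale are made-up example values):

#include <cmath>
#include <cstdio>

static int rescale_dimension(float dimension, float scale)
{
    // floor(x + 0.5f) == round-half-up, matching the expression in configure()
    return static_cast<int>(std::floor(dimension / scale + 0.5f));
}

int main()
{
    const float scale = 1.6f;
    std::printf("IMG_HEIGHT=%d IMG_WIDTH=%d\n",
                rescale_dimension(600.f, scale),   // -> 375
                rescale_dimension(800.f, scale));  // -> 500
    return 0;
}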
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index be4d687..53a5456 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -67,18 +67,22 @@
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
- const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ const unsigned int num_elems_processed_per_iteration_x = is_nhwc ? 4 : max_cl_vector_width / input->element_size();
+ constexpr unsigned int num_elems_processed_per_iteration_y = 2;
// Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
const bool window_changed = update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, input->valid_region());
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(err, win_collapsed);
}
} // namespace
@@ -96,14 +100,19 @@
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
- const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL));
- const unsigned int block_size = max_cl_vector_width / input->info()->element_size();
+ const DataLayout data_layout = input->info()->data_layout();
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ const unsigned int vec_size = is_nhwc ? 4 : max_cl_vector_width / input->info()->element_size();
// Set kernel build options
CLBuildOptions build_opts;
build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups));
- build_opts.add_option("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(channels - vec_size), 0)));
+
switch(input->info()->element_size())
{
case 1:
@@ -120,12 +129,33 @@
}
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("channel_shuffle_nchw", build_opts.options()));
+ std::string kernel_name = "channel_shuffle_" + lower_string(string_from_data_layout(data_layout));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(num_groups);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(2));
}
Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
@@ -141,14 +171,9 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
- }
- while(window.slide_window_slice_3D(slice));
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window);
+ add_4D_tensor_argument(idx, _output, window);
+ enqueue(queue, *this, window, lws_hint());
}
} // namespace arm_compute
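For reference, a standalone sketch of the group-interleaving permutation a channel shuffle performs, expressed with the same NUM_GROUPS and K = channels / num_groups quantities passed as build options above. The mapping direction is an assumption made for illustration and is not taken from the OpenCL kernel source:

#include <cstdio>
#include <vector>

static std::vector<int> channel_shuffle(const std::vector<int> &in, int num_groups)
{
    const int channels = static_cast<int>(in.size());
    const int K        = channels / num_groups; // channels per group
    std::vector<int> out(in.size());
    for(int c = 0; c < channels; ++c)
    {
        const int group        = c / K;
        const int within_group = c % K;
        out[within_group * num_groups + group] = in[c]; // interleave the groups
    }
    return out;
}

int main()
{
    // 6 channels, 2 groups: {0,1,2, 3,4,5} -> {0,3,1,4,2,5}
    const auto shuffled = channel_shuffle({0, 1, 2, 3, 4, 5}, 2);
    for(int c : shuffled)
    {
        std::printf("%d ", c);
    }
    std::printf("\n");
    return 0;
}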
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 40032f9..d748745 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -40,7 +40,7 @@
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -49,7 +49,7 @@
// Checks performed when output is configured
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, num_groups));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, true, num_groups));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW");
@@ -58,11 +58,11 @@
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, num_groups)).set_data_layout(DataLayout::NCHW));
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW));
const unsigned int num_elems_read_per_iteration = 8;
@@ -87,7 +87,7 @@
{
}
-void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -105,8 +105,8 @@
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first));
- build_opts.add_option_if(num_groups > 1, "-DGROUPING");
+ build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width));
+ build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts.options()));
@@ -130,7 +130,7 @@
_config_id += support::cpp11::to_string(output->info()->dimension(1));
}
-Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims, num_groups));
@@ -143,22 +143,26 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+ bool is_collapsed = false;
+ bool is_collapsed_out = false;
+
Window out_window;
out_window.use_tensor_dimensions(_output->info()->tensor_shape());
- Window slice = window.first_slice_window_3D();
- Window slice_out = out_window.first_slice_window_3D();
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed);
+ Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out);
- unsigned int idx = 2 * num_arguments_per_3D_tensor();
- _kernel.setArg<cl_uint>(idx++, _output->info()->strides_in_bytes()[3]);
+ ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out);
+ Window slice = collapsed.first_slice_window_3D();
+ Window slice_out = collapsed_out.first_slice_window_4D();
do
{
// Set inputs
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice_out);
+ add_4D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice, lws_hint());
}
- while(window.slide_window_slice_3D(slice) && out_window.slide_window_slice_3D(slice_out));
+ while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
}
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
index e79019e..4f178c9 100644
--- a/src/core/CL/kernels/CLColorConvertKernel.cpp
+++ b/src/core/CL/kernels/CLColorConvertKernel.cpp
@@ -61,6 +61,7 @@
num_elems_processed_per_iteration = 16;
break;
default:
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
break;
@@ -75,6 +76,7 @@
num_elems_processed_per_iteration = 8;
break;
default:
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
break;
@@ -84,9 +86,11 @@
switch(output->info()->format())
{
case Format::RGBA8888:
+ case Format::U8:
num_elems_processed_per_iteration = 16;
break;
default:
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
break;
@@ -143,6 +147,7 @@
num_elems_processed_per_iteration = 4;
break;
default:
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
break;
@@ -220,6 +225,7 @@
num_elems_read_per_iteration_x = 16;
break;
default:
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
break;
@@ -235,6 +241,7 @@
num_elems_read_per_iteration_x = 8;
break;
default:
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
break;
@@ -303,6 +310,7 @@
num_elems_processed_per_iteration = 16;
break;
default:
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
break;
@@ -316,6 +324,7 @@
num_elems_processed_per_iteration = 16;
break;
default:
+ ARM_COMPUTE_ERROR("Not supported");
break;
}
break;
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
index 2da67d2..e14e5da 100644
--- a/src/core/CL/kernels/CLCopyKernel.cpp
+++ b/src/core/CL/kernels/CLCopyKernel.cpp
@@ -30,21 +30,22 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
// Validate output if initialized
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -69,6 +70,64 @@
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
+
+std::pair<Status, Window> validate_and_configure_window_with_padding(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding)
+{
+ TensorShape input_shape = input->tensor_shape();
+ TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input_shape, padding);
+
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape));
+
+ // Configure window
+ const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ // Pad on the x dimension accounting for the padding offset along the same dimension
+ AccessWindowHorizontal output_access(output, padding[0].first, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+/** Generate the build option string "-DPAD<dim><index>=<padding>"
+ *
+ * @param[in] dim     The dimension index
+ * @param[in] index   0 for the padding at the start of the dimension, 1 for the padding at the end
+ * @param[in] padding The padding value for that dimension/index pair
+ *
+ * @return The concatenated build option string
+ */
+std::string generate_pad_string(const size_t dim, const size_t index, const size_t padding)
+{
+ return "-DPAD" + support::cpp11::to_string(dim) + support::cpp11::to_string(index) + "=" + support::cpp11::to_string(padding);
+}
+
+/** Pass the padding as build options to the kernel.
+ *
+ * @param[in]  padding    The list of paddings (start/end pair) for each dimension
+ * @param[out] build_opts The build options to which the padding options are added
+ */
+void add_padding_as_build_options(const PaddingList &padding, CLBuildOptions &build_opts)
+{
+ size_t dim = 0;
+ for(dim = 0; dim < padding.size(); dim++)
+ {
+ build_opts.add_option(generate_pad_string(dim, 0, padding[dim].first));
+ build_opts.add_option(generate_pad_string(dim, 1, padding[dim].second));
+ }
+
+ while(dim < TensorShape::num_max_dimensions)
+ {
+ build_opts.add_option(generate_pad_string(dim, 0, 0));
+ build_opts.add_option(generate_pad_string(dim, 1, 0));
+ dim++;
+ }
+}
+
} // namespace
CLCopyKernel::CLCopyKernel()
@@ -76,32 +135,68 @@
{
}
-void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding));
_input = input;
_output = output;
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_tensor", build_opts.options()));
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+
+ std::pair<Status, Window> win_config;
+
+ if(padding.empty())
+ {
+ // Build kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_tensor", build_opts.options()));
+
+ // Configure window
+ win_config = validate_and_configure_window(input->info(), output->info());
+ }
+ else
+ {
+ // Add compile time options
+ add_padding_as_build_options(padding, build_opts);
+
+ // If we are padding in the fourth dimension, the kernel needs to know the depth of the
+ // individual 3D volumes
+ if(padding.size() == 4)
+ {
+ const size_t depth = input->info()->tensor_shape()[2];
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
+ }
+
+ // Build kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_pad_tensor", build_opts.options()));
+
+ // Configure window
+ win_config = validate_and_configure_window_with_padding(input->info(), output->info(), padding);
+ }
+
+ // Validate and set the window
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
}
-Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, const PaddingList &padding)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding));
+
+ if(padding.empty())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(input->clone().get(), output->clone().get(), padding).first);
+ }
return Status{};
}
@@ -123,3 +218,4 @@
}
while(collapsed.slide_window_slice_3D(slice));
}
+} // namespace arm_compute
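The helpers above turn a PaddingList into one pair of -DPAD<dim><index>=<value> defines per dimension, zero-filled up to the maximum dimension count. A standalone sketch of the options that would be produced, with PaddingList and CLBuildOptions replaced by standard-library stand-ins and num_max_dimensions assumed to be 6:

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

using PaddingListStub = std::vector<std::pair<size_t, size_t>>;

static std::vector<std::string> padding_build_options(const PaddingListStub &padding, size_t max_dims)
{
    std::vector<std::string> opts;
    size_t dim = 0;
    for(; dim < padding.size(); ++dim)
    {
        opts.push_back("-DPAD" + std::to_string(dim) + "0=" + std::to_string(padding[dim].first));
        opts.push_back("-DPAD" + std::to_string(dim) + "1=" + std::to_string(padding[dim].second));
    }
    for(; dim < max_dims; ++dim)
    {
        opts.push_back("-DPAD" + std::to_string(dim) + "0=0"); // unpadded dimensions default to zero
        opts.push_back("-DPAD" + std::to_string(dim) + "1=0");
    }
    return opts;
}

int main()
{
    // Pad 1 element at the start of x, 2 at the end of y
    for(const auto &opt : padding_build_options({{1, 0}, {0, 2}}, 6))
    {
        std::printf("%s\n", opt.c_str());
    }
    return 0;
}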
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index c6a0031..dd7d790 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -43,13 +43,21 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) == 0);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
+ for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
}
@@ -93,28 +101,61 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+ const DataLayout data_layout = _input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
const int out_start_x = _info.pad().first;
- const int out_end_x = _output->info()->dimension(0) - _inner_border.right - _info.pad().first + _info.stride().first - 1;
+ const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right - _info.pad().first + _info.stride().first - 1;
const int out_step_x = _info.stride().first;
const int out_start_y = _inner_border.top + _info.pad().second;
- const int out_end_y = _output->info()->dimension(1) - _info.pad().second + _info.stride().second - 1;
+ const int out_end_y = _output->info()->dimension(idx_h) - _info.pad().second + _info.stride().second - 1;
const int out_step_y = _info.stride().second;
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- Window slice_out = collapsed.first_slice_window_3D();
- slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
- slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
-
- Window slice_in = collapsed.first_slice_window_3D();
-
- do
+ switch(data_layout)
{
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
+ case DataLayout::NCHW:
+ {
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+ Window slice_out = collapsed.first_slice_window_3D();
+ slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
+ slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+ Window slice_in = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ // NOTE: not collapsing in NHWC
+ Window slice_out = window.first_slice_window_3D();
+ slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x));
+ slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+ Window slice_in = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data layout");
}
- while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
}
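To see which output columns the strided window in run() actually visits, here is a standalone sketch using the same start/end/step expressions; iterating a window dimension as for(x = start; x < end; x += step) is an assumption made for this illustration, and the example values (output width 8, stride 2, pad 1, no inner border) are made up:

#include <cstdio>

int main()
{
    const int out_dim_x          = 8;
    const int stride_x           = 2;
    const int pad_x              = 1;
    const int inner_border_right = 0;

    const int out_start_x = pad_x;
    const int out_end_x   = out_dim_x - inner_border_right - pad_x + stride_x - 1;
    const int out_step_x  = stride_x;

    // Prints: 1 3 5 7 -> only every stride-th column receives an input sample
    for(int x = out_start_x; x < out_end_x; x += out_step_x)
    {
        std::printf("%d ", x);
    }
    std::printf("\n");
    return 0;
}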
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index a40aa28..eb561fa 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -207,8 +207,7 @@
}
void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- ActivationLayerInfo act_info)
+ unsigned int depth_multiplier, ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
@@ -225,8 +224,17 @@
_conv_pad_top = conv_info.pad_top();
_border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
+ // Configure kernel window
+ std::string kernel_name;
+ const GPUTarget gpu_target = get_target();
+
+ auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
// Set build options
CLBuildOptions build_opts;
+ build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->tensor_shape().z()));
build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
@@ -263,25 +271,16 @@
const float s2 = output->info()->quantization_info().scale;
const int o2 = output->info()->quantization_info().offset;
+ build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+ build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
if(o1 != o2 || s1 != s2)
{
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
}
}
}
}
-
- // Configure kernel window
- std::string kernel_name;
- const GPUTarget gpu_target = get_target();
-
- auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Set config_id for enabling LWS tuning
@@ -316,15 +315,17 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- // Create input window and adjust
- Window win_in = window;
- win_in.adjust(Window::DimX, -_conv_pad_left, true);
- win_in.adjust(Window::DimY, -_conv_pad_top, true);
- win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
- win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice_in = win_in.first_slice_window_3D();
- Window slice_out = window.first_slice_window_3D();
+ // Create input window and adjust
+ Window collapsed_in = collapsed;
+ collapsed_in.adjust(Window::DimX, -_conv_pad_left, true);
+ collapsed_in.adjust(Window::DimY, -_conv_pad_top, true);
+ collapsed_in.set_dimension_step(Window::DimX, collapsed_in.x().step() * _conv_stride_x);
+ collapsed_in.set_dimension_step(Window::DimY, collapsed_in.y().step() * _conv_stride_y);
+
+ Window slice_in = collapsed_in.first_slice_window_3D();
+ Window slice_out = collapsed.first_slice_window_3D();
Window slice_weights = window.first_slice_window_3D();
slice_weights.set_dimension_step(Window::DimX, 0);
slice_weights.set_dimension_step(Window::DimY, 0);
@@ -347,5 +348,5 @@
enqueue(queue, *this, slice_out, lws_hint());
}
- while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+ while(collapsed.slide_window_slice_3D(slice_out) && collapsed_in.slide_window_slice_3D(slice_in));
}
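The S1/O1 and S2/O2 build options above carry the input and output quantization parameters; S2/O2 are only added when the output has to be requantized. A sketch of the standard asymmetric requantization identity those constants support, real = scale * (q - offset), hence q2 = (s1 / s2) * (q1 - o1) + o2; the exact expression inside the OpenCL kernel is not shown in this patch, so treat this as an illustration only:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint8_t requantize(uint8_t q1, float s1, int o1, float s2, int o2)
{
    const float real = s1 * (static_cast<int>(q1) - o1);               // dequantize with input params
    const int   q2   = static_cast<int>(std::lround(real / s2)) + o2;  // requantize with output params
    return static_cast<uint8_t>(std::min(255, std::max(0, q2)));       // clamp to the QASYMM8 range
}

int main()
{
    // Input quantized with scale 0.1, offset 128; output with scale 0.2, offset 100
    std::printf("%d\n", static_cast<int>(requantize(148, 0.1f, 128, 0.2f, 100))); // real 2.0 -> q2 = 110
    return 0;
}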
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 50f17d5..1fce14f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
@@ -44,14 +45,15 @@
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::F32 || ((act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
- && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC))),
- "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && ((input->data_type() != DataType::QASYMM8) || ((act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
+ && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC))),
+ "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported"); //COMPMID-1317 add fused activation for F32
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) != 3 || weights->dimension(2) != 3);
const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
@@ -96,7 +98,7 @@
const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
const unsigned int num_rows_processed_per_iteration = is_stride_1 ? 2 : 1;
- const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : 2;
+ const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->element_size());
const unsigned int num_rows_read_per_iteration = num_rows_processed_per_iteration + 2;
const unsigned int num_rows_written_per_iteration = std::ceil(num_rows_processed_per_iteration / static_cast<float>(conv_info.stride().first));
@@ -137,8 +139,7 @@
}
void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- ActivationLayerInfo act_info)
+ unsigned int depth_multiplier, ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -158,8 +159,9 @@
ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 2);
ARM_COMPUTE_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
- const bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
_input = input;
_output = output;
@@ -168,9 +170,16 @@
_conv_stride_y = conv_info.stride().second;
_num_rows_processed_per_iteration = is_stride_1 ? 2 : 1;
_num_planes_processed_per_iteration = is_stride_1 ? 2 : 1;
- _border_size = BorderSize(conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
- const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : 2;
+ // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1
+ if(is_dot8_supported && is_qasymm)
+ {
+ _num_planes_processed_per_iteration = 1;
+ }
+
+ _border_size = BorderSize(is_qasymm && is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
+
+ const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->info()->element_size());
CLBuildOptions build_opts;
build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
@@ -211,16 +220,20 @@
const float s2 = output->info()->quantization_info().scale;
const int o2 = output->info()->quantization_info().offset;
+ build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+ build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
if(o1 != o2 || s1 != s2)
{
- build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
- build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
}
}
}
}
+ else
+ {
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
+ }
if(is_stride_1)
{
@@ -233,11 +246,12 @@
build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_stride_x));
build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
}
+ build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
+ "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
// Create kernel
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
- std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
- && is_stride_1 ) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
+ std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
+ && is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
@@ -280,8 +294,12 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- Window win = window;
- win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)), 1));
+ // Collapse window
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3);
+
+ Window win = window_collapsed;
+ win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1));
// Create input window and adjust
Window win_in = win;
@@ -290,10 +308,10 @@
ARM_COMPUTE_ERROR_ON((win_in.y().step() < window.y().step()) || (win_in.z().step() < window.z().step()));
- Window slice_in = win_in.first_slice_window_3D();
- Window slice_out = win.first_slice_window_3D();
+ Window slice_in = win_in.first_slice_window_4D();
+ Window slice_out = win.first_slice_window_4D();
- unsigned int idx = 3 * num_arguments_per_3D_tensor();
+ unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
if(_biases != nullptr)
{
@@ -310,11 +328,11 @@
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
add_3D_tensor_argument(idx, _weights, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
}
- while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+ while(win.slide_window_slice_4D(slice_out) && win_in.slide_window_slice_4D(slice_in));
}
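The run() above sizes DimZ as the number of plane iterations, ceil(output_depth / planes_per_iteration), multiplied by the batch count so that batches are folded into the same dimension. A small sketch of that arithmetic with made-up numbers:

#include <cmath>
#include <cstdio>

int main()
{
    const float output_depth         = 11.f; // dimension(2) of the output
    const float planes_per_iteration = 2.f;  // _num_planes_processed_per_iteration
    const int   total_batches        = 4;    // tensor_shape().total_size_upper(3)

    const int plane_iterations = static_cast<int>(std::ceil(output_depth / planes_per_iteration)); // 6
    const int dim_z_extent     = plane_iterations * total_batches;                                  // 24

    std::printf("plane iterations: %d, DimZ extent: %d\n", plane_iterations, dim_z_extent);
    return 0;
}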
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
index f51628f..af7df14 100644
--- a/src/core/CL/kernels/CLDerivativeKernel.cpp
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -96,10 +96,12 @@
AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
if(_run_derivative_x && _run_derivative_y)
{
+ // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2'
input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_read_rows_per_iteration);
}
else if(_run_derivative_x)
{
+ // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2'
input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration + 2);
}
else if(_run_derivative_y)
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index c8da7ac..471b320 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -93,8 +93,14 @@
inline bool can_run_optimized_kernel_for_bifrost(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
DataType data_type, DataLayout data_layout)
{
- return gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76) && (kernel_size <= 5)
- && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32) && (data_layout == DataLayout::NCHW);
+ return gpu_target_is_in(gpu_target,
+ GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+ GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+ GPUTarget::G52, GPUTarget::G52LIT)
+ && (kernel_size <= 5)
+ && (conv_stride_x == 1) && (conv_stride_y == 1)
+ && (data_type == DataType::F32)
+ && (data_layout == DataLayout::NCHW);
}
inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
@@ -278,6 +284,7 @@
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
     // Output auto initialization if not yet initialized
+ // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
auto_init_if_empty(*output, output_shape,
1,
input->data_type(),
@@ -356,6 +363,7 @@
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
     // Output auto initialization if not yet initialized
+ // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
auto_init_if_empty(*output->info(),
output_shape,
1,
@@ -413,8 +421,7 @@
}
else
{
- bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
-
+ const bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index baf6bb6..6920667 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -168,7 +168,8 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- Window slice = window.first_slice_window_3D();
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
do
{
@@ -176,5 +177,5 @@
add_3D_tensor_argument(idx, _tensor, slice);
enqueue(queue, *this, slice, cl::NullRange);
}
- while(window.slide_window_slice_3D(slice));
+ while(collapsed.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
index 1718914..5c38568 100644
--- a/src/core/CL/kernels/CLFlattenLayerKernel.cpp
+++ b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
@@ -90,19 +90,21 @@
_input = input;
_output = output;
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("flatten", build_opts.options()));
-
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option_if(output->info()->num_dimensions() > 2, "-DDST_DIM1=" + support::cpp11::to_string(output->info()->dimension(1)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("flatten", build_opts.options()));
+
// Set config_id for enabling LWS tuning
_config_id = "flatten";
_config_id += "_";
@@ -131,21 +133,15 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
+ Window collapsed_window = window.collapse(ICLKernel::window(), Window::DimZ);
- Window out_slice = out_window.first_slice_window_1D();
- Window in_slice = window.first_slice_window_3D();
+ Window output_window;
+ output_window.use_tensor_dimensions(_output->info()->tensor_shape());
// Run kernel
- do
- {
- // Set arguments
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_1D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, collapsed_window);
+ add_3D_tensor_argument(idx, _output, output_window);
+ enqueue(queue, *this, collapsed_window, lws_hint());
}
} // namespace arm_compute
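The SRC_WIDTH / SRC_HEIGHT / SRC_DEPTH constants added above are what a flatten kernel needs to collapse an (x, y, z) coordinate into a single linear offset. A plain C++ sketch of that index arithmetic (the actual OpenCL indexing is not shown in this patch):

#include <cstdio>

static int flatten_offset(int x, int y, int z, int src_width, int src_height)
{
    return x + y * src_width + z * src_width * src_height;
}

int main()
{
    const int src_width = 4, src_height = 3, src_depth = 2;
    // The last element of a 4x3x2 volume maps to offset 23 == 4*3*2 - 1
    std::printf("%d of %d\n",
                flatten_offset(3, 2, 1, src_width, src_height),
                src_width * src_height * src_depth - 1);
    return 0;
}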
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
index 20e3a3a..831173d 100644
--- a/src/core/CL/kernels/CLFloorKernel.cpp
+++ b/src/core/CL/kernels/CLFloorKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
@@ -33,7 +34,42 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+ // Validate in case of configured output
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ auto_init_if_empty(*output, *input);
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
CLFloorKernel::CLFloorKernel()
: _input(nullptr), _output(nullptr)
@@ -47,14 +83,13 @@
// Auto initialize output
auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ // Validate
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
- constexpr unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
// Create kernel
std::set<std::string> build_opts;
@@ -63,13 +98,17 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("floor_layer", build_opts));
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
- ICLKernel::configure_internal(win);
+Status CLFloorKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+ return Status{};
}
void CLFloorKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -89,3 +128,4 @@
}
while(collapsed.slide_window_slice_3D(slice));
}
+} // namespace arm_compute
\ No newline at end of file
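The floor kernel now derives its vector size from the element size, processing 16 bytes per iteration: 4 elements for F32, 8 for F16. A one-line sketch of that relation:

#include <cstdio>

int main()
{
    const unsigned int bytes_per_iteration = 16;
    const unsigned int f32_size = 4; // sizeof(float)
    const unsigned int f16_size = 2; // sizeof(cl_half)
    std::printf("F32: %u elements, F16: %u elements\n",
                bytes_per_iteration / f32_size,  // 4
                bytes_per_iteration / f16_size); // 8
    return 0;
}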
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
new file mode 100644
index 0000000..e14b8a3
--- /dev/null
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+ const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+ float epsilon)
+{
+ ARM_COMPUTE_UNUSED(epsilon);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(conv_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(conv_weights, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_var);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_mean, bn_var);
+
+ unsigned int kernels_idx = get_data_layout_dimension_index(conv_weights->data_layout(), DataLayoutDimension::BATCHES);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_weights->dimension(kernels_idx) != bn_mean->dimension(0));
+
+ // Validate bias
+ if(conv_bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, conv_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, conv_bias);
+ }
+ // Validate beta
+ if(bn_beta != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_beta);
+ }
+ // Validate gamma
+ if(bn_gamma != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_gamma);
+ }
+
+ // Validate output weights
+ if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(conv_weights, fused_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(conv_weights, fused_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, fused_weights);
+ }
+ // Validate output bias
+ if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, fused_bias);
+ }
+
+ return Status{};
+}
+} // namespace
+
+CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel()
+ : _conv_weights(nullptr), _conv_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
+ _run_in_place_weights(false), _run_in_place_bias(false)
+{
+}
+
+void CLFuseBatchNormalizationKernel::configure(const ICLTensor *conv_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
+ ICLTensor *fused_weights, ICLTensor *fused_bias,
+ const ICLTensor *conv_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
+ float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(conv_weights, bn_mean, bn_var);
+
+ _conv_weights = conv_weights;
+ _conv_bias = conv_bias;
+ _bn_mean = bn_mean;
+ _bn_var = bn_var;
+ _bn_beta = bn_beta;
+ _bn_gamma = bn_gamma;
+ _fused_weights = fused_weights;
+ _fused_bias = fused_bias;
+ _epsilon = epsilon;
+
+ _run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights);
+ _run_in_place_bias = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias);
+
+ // Auto initialize outputs
+ if(_fused_weights != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*_fused_weights->info(), *_conv_weights->info()->clone());
+ fused_weights->info()->set_valid_region(conv_weights->info()->valid_region());
+ }
+ if(_fused_bias != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
+ _fused_bias->info()->set_valid_region(bn_mean->info()->valid_region());
+ }
+
+ // Validate arguments
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(conv_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr,
+ (conv_bias != nullptr) ? conv_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr,
+ (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
+ epsilon));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration_x = 16 / conv_weights->info()->element_size();
+ const int output_width_x = conv_weights->info()->tensor_shape().x();
+ const bool multi_access_x = (output_width_x / num_elems_processed_per_iteration_x > 0);
+
+ Window win = calculate_max_window(*conv_weights->info());
+ if(multi_access_x)
+ {
+ win.set(Window::DimX, Window::Dimension(win.x().start(),
+ ceil_to_multiple(win.x().end(), num_elems_processed_per_iteration_x),
+ num_elems_processed_per_iteration_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(conv_weights->info()->data_type()));
+ build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(conv_weights->info()->data_type()));
+ build_opts.add_option("-DNUM_CHANNELS=" + support::cpp11::to_string(conv_weights->info()->dimension(2)));
+ build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - num_elems_processed_per_iteration_x, 0)));
+ build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W");
+ build_opts.add_option_if(_run_in_place_bias, "-DIN_PLACE_B");
+ build_opts.add_option_if(conv_bias != nullptr, "-DHAS_BIAS");
+ build_opts.add_option_if(bn_beta == nullptr, "-DUSE_DEFAULT_BETA");
+ build_opts.add_option_if(bn_gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("fuse_batchnormalization_layer", build_opts.options()));
+}
+
+Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+ const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+ float epsilon)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon));
+ return Status{};
+}
+
+void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Create window slice
+ Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
+ Window slice = collapsed_window.first_slice_window_4D();
+
+ Window vector_slice = window.first_slice_window_1D();
+ vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Add kernel arguments
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _conv_weights, slice);
+ add_1D_tensor_argument(idx, _bn_mean, vector_slice);
+ add_1D_tensor_argument(idx, _bn_var, vector_slice);
+ if(!_run_in_place_weights)
+ {
+ add_4D_tensor_argument(idx, _fused_weights, slice);
+ }
+ if(!_run_in_place_bias)
+ {
+ add_1D_tensor_argument(idx, _fused_bias, vector_slice);
+ }
+ if(_conv_bias != nullptr)
+ {
+ add_1D_tensor_argument(idx, _conv_bias, vector_slice);
+ }
+ if(_bn_beta != nullptr)
+ {
+ add_1D_tensor_argument(idx, _bn_beta, vector_slice);
+ }
+ if(_bn_gamma != nullptr)
+ {
+ add_1D_tensor_argument(idx, _bn_gamma, vector_slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+}
+} // namespace arm_compute
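
For reference, the arithmetic that the fuse_batchnormalization_layer kernel offloads to the device is the usual convolution/batch-normalization fold. A minimal host-side sketch of the per-channel computation (illustrative only, not part of the library API; it assumes a dense per-channel weight layout and that all optional tensors are present):

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch of the fusion arithmetic, per output channel c:
//   w_fused = w * gamma / sqrt(var + epsilon)
//   b_fused = (b - mean) * gamma / sqrt(var + epsilon) + beta
void fuse_batchnorm_reference(std::vector<float> &weights,     // [channels * weights_per_channel]
                              std::vector<float> &bias,        // [channels]
                              const std::vector<float> &mean,  // [channels]
                              const std::vector<float> &var,   // [channels]
                              const std::vector<float> &beta,  // [channels]
                              const std::vector<float> &gamma, // [channels]
                              float epsilon)
{
    const std::size_t channels            = mean.size();
    const std::size_t weights_per_channel = weights.size() / channels;
    for(std::size_t c = 0; c < channels; ++c)
    {
        const float scale = gamma[c] / std::sqrt(var[c] + epsilon);
        for(std::size_t i = 0; i < weights_per_channel; ++i)
        {
            weights[c * weights_per_channel + i] *= scale;
        }
        bias[c] = (bias[c] - mean[c]) * scale + beta[c];
    }
}
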
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index ae54e77..f333c1b 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -115,7 +115,7 @@
{
}
-void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d)
+void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d, bool unroll_block)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -132,6 +132,7 @@
// Create build options
CLBuildOptions build_opts;
build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
+ build_opts.add_option_if(unroll_block, "-DUNROLL_BLOCK");
build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
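
As a reminder of what this kernel reshapes, the interleave step places each 4x4 block of the input on a single output row, reading the block column by column. A rough host-side sketch for a matrix whose dimensions are multiples of 4 (illustrative only; it ignores mult_interleave4x4_height, the unroll_block option added here, and the 3D reinterpretation):

#include <cstddef>
#include <vector>

// Interleave 4x4: each 4x4 block of src (rows x cols, row-major) is written
// column by column onto one row of the output, so the output is 4*cols wide.
std::vector<float> interleave4x4_reference(const std::vector<float> &src, std::size_t rows, std::size_t cols)
{
    std::vector<float> dst(rows * cols);
    std::size_t out = 0;
    for(std::size_t r = 0; r < rows; r += 4)
    {
        for(std::size_t c = 0; c < cols; ++c)
        {
            for(std::size_t i = 0; i < 4; ++i)
            {
                dst[out++] = src[(r + i) * cols + c];
            }
        }
    }
    return dst;
}
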
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index 9adf95f..b2fb3e0 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -57,19 +57,17 @@
Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
if(!is_interleaved_transposed)
{
ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
}
else
{
@@ -95,71 +93,127 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+ }
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
}
return Status{};
}
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, bool is_interleaved_transposed,
- ElementsProcessed &num_elements_processed)
+ const GEMMReshapeInfo &reshape_info, ElementsProcessed &num_elements_processed)
{
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+ bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
+ bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
Window win{};
+ Window win_out{};
bool window_changed = false;
+ // In case both input and output have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+ {
+ reinterpret_input_as_3d = false;
+ reinterpret_output_as_3d = false;
+ }
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)).set_data_type(DataType::S32));
+
+ TensorInfo tmp_info(*output);
+
+ if(reinterpret_output_as_3d)
+ {
+ // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+ // the window needs to be constructed on the 2D collapsed version of the tensor
+ TensorShape tmp_shape(output->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
// Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication
if(is_interleaved_transposed)
{
+ // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set
+ ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());
+
// Configure kernel window
num_elems_processed_per_iteration_x = 4;
num_elems_processed_per_iteration_y = 4;
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+ // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
+ const int m = reshape_info.m();
+ const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
+
+ win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
- AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowStatic input1_access(input1, 0, 0,
+ ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+ ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
+ AccessWindowStatic output_access(output, 0, 0,
+ ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+ output->dimension(1) + bottom_pad);
- window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+ window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+ update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+ output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
}
else
{
// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x
- num_elems_processed_per_iteration_x = 4;
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 5);
+ // Note: if the dot product instruction is available, the 8x2 tile has to be used
+ num_elems_processed_per_iteration_x = is_dot8_supported ? 8 : 4;
+ num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), is_dot8_supported ? 2 : 4);
+
+ // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+ // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
+ const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1];
+ const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
// Configure window
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
- AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1) + bottom_pad);
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+ AccessWindowStatic output_access(output, 0, 0,
+ ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+ output->dimension(1) + bottom_pad);
- window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+ window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+ update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
Coordinates coord;
coord.set_num_dimensions(output->num_dimensions());
- output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+ output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
}
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win;
+ const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+ collapsed = win.collapse(win, dimension_to_collapse);
+
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(err, collapsed);
}
} // namespace
CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr)
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
{
}
@@ -167,18 +221,25 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- // Output tensor auto inizialitation if not yet initialized
- TensorShape tensor_shape{ input0->info()->tensor_shape() };
- tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
- tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
-
- auto_init_if_empty(*output->info(), tensor_shape, 1, DataType::S32, QuantizationInfo());
-
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
- _input0 = input0;
- _input1 = input1;
- _output = output;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
+ _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
+
+ // In case both input and output have to be reinterpreted as 3D tensors,
+ // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+ {
+ _reinterpret_input_as_3d = false;
+ _reinterpret_output_as_3d = false;
+ }
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();
+ _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
ElementsProcessed num_elements_processed{};
@@ -186,15 +247,21 @@
GPUTarget arch_target = get_arch_from_target(get_target());
// Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, num_elements_processed);
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, num_elements_processed);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
// Create build options
- CLBuildOptions build_opts;
std::string kernel_name(" ");
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+
if(is_interleaved_transposed)
{
const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
@@ -205,6 +272,7 @@
// the correct step which is calculated as (16 * mult_transpose1xW_width) / 4)
build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+ build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
build_opts.add_option("-DTRANSPOSE1XW_WIDTH_STEP=" + support::cpp11::to_string(4 * mult_transpose1xW_width));
build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
@@ -225,6 +293,8 @@
// Set config_id for enabling LWS tuning
_config_id = "gemmlowp_";
_config_id += (is_interleaved_transposed ? "reshaped_" : "");
+ _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+ _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
_config_id += lower_string(string_from_data_type(input0->info()->data_type()));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(1));
@@ -242,6 +312,7 @@
input1->clone().get(),
output->clone().get(),
is_interleaved_transposed,
+ reshape_info,
num_elements_processed)
.first);
@@ -253,18 +324,40 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ // The stride_z for matrix B must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+ }
+
+ Window slice = window.first_slice_window_3D();
Window slice_matrix_b = slice;
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
- slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ if(_reinterpret_input_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
+ const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
+
+ if(_reinterpret_output_as_3d)
+ {
+ // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+ const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+ }
do
{
Window slice_b = slice;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
- if(_input1->info()->num_dimensions() < 3)
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(!_slide_matrix_b)
{
slice_b = slice_matrix_b;
}
@@ -273,7 +366,10 @@
add_2D_tensor_argument(idx, _input0, slice);
add_2D_tensor_argument(idx, _input1, slice_b);
add_2D_tensor_argument(idx, _output, slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
enqueue(queue, *this, slice, lws_hint());
}
- while(window.slide_window_slice_2D(slice));
+ while(window.slide_window_slice_3D(slice));
}
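
The manual bottom padding introduced above rounds the number of output rows up to the tile height, so the 8x2 dot-product tiles and the generic 4x4 tiles never read or write past the last valid row without declared padding. A tiny sketch of just that calculation (hypothetical helper, mirroring the expression used in validate_and_configure_window):

#include <cassert>

// Rows of padding needed at the bottom so that m becomes a multiple of the tile height.
constexpr int bottom_pad_for(int m, int rows_per_tile)
{
    return (rows_per_tile - (m % rows_per_tile)) % rows_per_tile;
}

int main()
{
    assert(bottom_pad_for(7, 2) == 1); // dot8 path: 8x2 tiles
    assert(bottom_pad_for(8, 4) == 0); // generic path: 4x4 tiles
    assert(bottom_pad_for(5, 4) == 3);
    return 0;
}
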
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
index aa954ab..d348f2c 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -46,11 +46,18 @@
namespace
{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
int32_t a_offset, int32_t b_offset)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+ }
+
// If a_offset == 0, vector_sum_col can be a nullptr
if(a_offset != 0)
{
@@ -62,16 +69,24 @@
if(b_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+ // Validate input
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
TensorShape output_shape = mm_result->tensor_shape();
if(output_shape.num_dimensions() > 1)
{
+ const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(2);
+ output_shape.collapse_from(output_batch_idx);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
"mm_result tensor must have the same number of batches of output tensor");
if(a_offset != 0)
@@ -88,7 +103,7 @@
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias,
int32_t a_offset, int32_t b_offset)
{
constexpr unsigned int num_elems_processed_per_iteration = 4;
@@ -98,20 +113,23 @@
Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win,
- mm_result_access);
+ window_changed = window_changed || update_window_and_padding(win, mm_result_access);
if(a_offset != 0)
{
AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win,
- vector_sum_col_access);
+ window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
}
if(b_offset != 0)
{
AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
- window_changed = window_changed || update_window_and_padding(win,
- vector_sum_row_access);
+ window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
+ }
+
+ if(bias != nullptr)
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+ window_changed = window_changed || update_window_and_padding(win, bias_access);
}
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -120,22 +138,30 @@
} // namespace
CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
- : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr)
+ : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr)
{
}
-void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
+ int32_t b_offset)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+ bias != nullptr ? bias->info() : nullptr,
a_offset, b_offset)); // NOLINT
_vector_sum_col = vector_sum_col;
_vector_sum_row = vector_sum_row;
_mm_result = mm_result;
+ _bias = bias;
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = vector_sum_row != nullptr
+ && mm_result->info()->num_dimensions() > 1
+ && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
// Set the arguments to pass at compile time
CLBuildOptions build_opts;
@@ -149,20 +175,26 @@
// If b_offset == 0, vector_sum_row can be a nullptr
build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+ build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
+ build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+ std::string kernel_name("gemmlowp_offset_contribution");
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Configure kernel window
auto win_config = validate_and_configure_window(mm_result->info(),
vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+ bias != nullptr ? bias->info() : nullptr,
a_offset, b_offset); // NOLINT
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
// Set config_id for enabling LWS tuning
- _config_id = "gemmlowp_offset_contribution_";
+ _config_id = kernel_name + "_";
_config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
@@ -170,13 +202,14 @@
_config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
}
-Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
int32_t a_offset, int32_t b_offset)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+ bias != nullptr ? bias->clone().get() : nullptr,
a_offset, b_offset)
.first); // NOLINT
@@ -194,11 +227,17 @@
// Set window for vector_sum_col
Window win_vector_sum_col = slice;
win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
// Set window for vector_sum_row
Window win_vector_sum_row = slice;
win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window biases_slice = slice;
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
do
{
@@ -212,7 +251,11 @@
{
add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row);
}
- enqueue(queue, *this, slice);
+ if(_bias != nullptr)
+ {
+ add_1D_tensor_argument(idx, _bias, biases_slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
}
while(collapsed.slide_window_slice_3D(slice));
}
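
For context, the offset contribution computed here is the standard correction term of a quantized GEMM: the S32 accumulator for element (x, y) receives a_offset * sum_col(x) + b_offset * sum_row(y) + a_offset * b_offset * k (the K_OFFSET build option), plus the optional bias this patch adds. A host-side sketch of the per-element arithmetic (illustrative only; the names are not the CL kernel's):

#include <cstdint>

// Offset-contribution correction applied to one S32 accumulator of a quantized GEMM.
int32_t offset_contribution(int32_t mm_result, // sum_k A_q(y,k) * B_q(k,x)
                            int32_t sum_col_x, // sum_k B_q(k,x), needed when a_offset != 0
                            int32_t sum_row_y, // sum_k A_q(y,k), needed when b_offset != 0
                            int32_t bias_x,    // optional per-output-channel bias (0 if absent)
                            int32_t a_offset, int32_t b_offset, int32_t k)
{
    int32_t result = mm_result;
    result += a_offset * sum_col_x;
    result += b_offset * sum_row_y;
    result += a_offset * b_offset * k; // K_OFFSET build option
    result += bias_x;                  // ADD_BIAS build option
    return result;
}
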
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
new file mode 100644
index 0000000..83af0c6
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
+ int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias == nullptr && a_offset == 0 && b_offset == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+ }
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if(a_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+ }
+
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ if(b_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+ // Validate input
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+ TensorShape output_shape = mm_result->tensor_shape();
+ if(output_shape.num_dimensions() > 1)
+ {
+ const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+ vector_sum_row_shape.collapse_from(1);
+ output_shape.collapse_from(output_batch_idx);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+ "mm_result tensor must have the same number of batches of output tensor");
+
+ if(a_offset != 0)
+ {
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+ vector_sum_col_shape.collapse_from(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ }
+ }
+ }
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, ITensorInfo *output,
+ int32_t a_offset, int32_t b_offset)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+ bool window_changed = false;
+
+ // Auto initialize the output
+ auto_init_if_empty(*output, mm_result->clone()->set_data_type(DataType::QASYMM8));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, mm_result_access);
+
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
+
+ if(a_offset != 0)
+ {
+ AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
+ }
+ if(b_offset != 0)
+ {
+ AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
+ window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
+ }
+
+ if(bias != nullptr)
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+ window_changed = window_changed || update_window_and_padding(win, bias_access);
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel()
+ : _mm_result(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
+ int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+ vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+ bias != nullptr ? bias->info() : nullptr,
+ output->info(),
+ a_offset, b_offset, output_stage)); // NOLINT
+
+ const int min = output_stage.gemmlowp_min_bound;
+ const int max = output_stage.gemmlowp_max_bound;
+
+ _vector_sum_col = vector_sum_col;
+ _vector_sum_row = vector_sum_row;
+ _mm_result = mm_result;
+ _bias = bias;
+ _output = output;
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = vector_sum_row != nullptr
+ && mm_result->info()->num_dimensions() > 1
+ && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if(a_offset != 0)
+ {
+ build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+ build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+ }
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+ build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+ build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
+ build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+ build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+ build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multiplier));
+ build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shift));
+ build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+ build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+
+ std::string kernel_name("gemmlowp_offset_contribution");
+
+ // Fuse output stage
+ if(output_stage.type != GEMMLowpOutputStageType::NONE)
+ {
+ kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("GEMMLowpOutputStage can not be NONE!");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(mm_result->info(),
+ vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+ bias != nullptr ? bias->info() : nullptr,
+ output->info(),
+ a_offset, b_offset); // NOLINT
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name + "_";
+ _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
+}
+
+Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+ const ITensorInfo *output,
+ int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
+ vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+ bias != nullptr ? bias->clone().get() : nullptr,
+ output->clone().get(),
+ a_offset, b_offset)
+ .first); // NOLINT
+
+ return Status{};
+}
+
+void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ // Set window for vector_sum_col
+ Window win_vector_sum_col = slice;
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row = slice;
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window biases_slice = slice;
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _mm_result, slice);
+ if(_vector_sum_col != nullptr)
+ {
+ add_2D_tensor_argument(idx, _vector_sum_col, win_vector_sum_col);
+ }
+ if(_vector_sum_row != nullptr)
+ {
+ add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row);
+ }
+ if(_bias != nullptr)
+ {
+ add_1D_tensor_argument(idx, _bias, biases_slice);
+ }
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
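
This fused kernel chains the offset correction with the requested GEMMLowp output stage. As a rough sketch of the QUANTIZE_DOWN variant described by the RESULT_OFFSET / RESULT_MULTIPLIER / RESULT_SHIFT options (an assumption about the fused math, not a transcription of the CL source), the corrected accumulator is scaled and clamped to QASYMM8:

#include <algorithm>
#include <cstdint>

// Sketch: integer "quantize down" of a corrected S32 accumulator to QASYMM8.
// The exact rounding in the fused CL kernel may differ; this mirrors the
// documented ((acc + offset) * multiplier) >> shift form with clamping.
uint8_t quantize_down(int32_t acc, int32_t result_offset, int32_t result_multiplier, int32_t result_shift,
                      int32_t min_bound = 0, int32_t max_bound = 255)
{
    int32_t value = ((acc + result_offset) * result_multiplier) >> result_shift;
    value         = std::max(min_bound, std::min(max_bound, value));
    return static_cast<uint8_t>(value);
}
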
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 875e26d..b7eff0f 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -27,9 +27,12 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "support/ToolchainSupport.h"
using namespace arm_compute;
@@ -38,7 +41,8 @@
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+ int min, int max)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
@@ -63,10 +67,13 @@
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
{
- constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
// Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
@@ -75,8 +82,9 @@
if(output->total_size() != 0)
{
+ Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_result_access);
+ window_changed = window_changed || update_window_and_padding(win_out, output_result_access);
output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
}
@@ -100,7 +108,8 @@
{
}
-Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+ int min, int max)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
@@ -112,20 +121,14 @@
return Status{};
}
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
+ int min, int max)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (bias != nullptr) ? bias->info() : nullptr,
- output->info(),
- min,
- max));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(),
+ min, max));
_input = input;
_bias = bias;
@@ -154,9 +157,11 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+ // Create input window
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
+ // Setup bias slice
unsigned int idx1 = num_arguments_per_3D_tensor();
if(_bias != nullptr)
{
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp
new file mode 100644
index 0000000..b7730d5
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+ int min, int max)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
+ ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
+
+ // Check biases if exist
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+ }
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win,
+ input_access);
+
+ if(output->total_size() != 0)
+ {
+ Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win_out, output_result_access);
+
+ output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+
+ if(bias != nullptr)
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+ window_changed = window_changed || update_window_and_padding(win, bias_access);
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+class Coordinates;
+} // namespace arm_compute
+
+CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel()
+ : _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ (bias != nullptr) ? bias->clone().get() : nullptr,
+ output->clone().get())
+ .first);
+
+ return Status{};
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ float multiplier, int offset,
+ int min, int max)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
+
+ _input = input;
+ _bias = bias;
+ _output = output;
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(multiplier));
+ build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(offset));
+ build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+ build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down_float", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Create input window
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ // Setup bias slice
+ unsigned int idx1 = num_arguments_per_3D_tensor();
+ if(_bias != nullptr)
+ {
+ Window biases_slice(slice);
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ add_1D_tensor_argument(idx1, _bias, biases_slice);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx1, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
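
The new ScaleByFloat stage replaces the integer multiplier/shift pair with a single real multiplier (REAL_MULTIPLIER) and an output offset (OUTPUT_OFFSET). A small host-side sketch of the per-element conversion (illustrative; the exact rounding order in the CL kernel may differ):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Sketch: down-scale an S32 accumulator (with any bias already added)
// to QASYMM8 using a floating-point multiplier.
uint8_t quantize_down_float(int32_t acc, float real_multiplier, int32_t output_offset,
                            int32_t min_bound = 0, int32_t max_bound = 255)
{
    const int32_t value = static_cast<int32_t>(std::lround(acc * real_multiplier)) + output_offset;
    return static_cast<uint8_t>(std::max(min_bound, std::min(max_bound, value)));
}
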
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index 5789113..621bd2b 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -63,7 +63,7 @@
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
{
- constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
// Configure kernel window
Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
index cd26cd1..225c358 100644
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -59,7 +60,7 @@
Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+ AccessWindowStatic input_access(input, 0, 0, input->dimension(0), input->dimension(1));
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, input_access, output_access);
@@ -115,8 +116,12 @@
CLBuildOptions build_opts;
build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0)));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+
+ std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : "");
+
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Configure kernel window
auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info());
@@ -196,8 +201,8 @@
Window slice_out = collapsed.first_slice_window_2D();
Window slice_in = slice_out;
- slice_in.set(Window::DimY, Window::Dimension(0, 1, 1));
- slice_in.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
do
{
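As a reminder, the matrix A reduction selected above (with or without the _dot8 variant) produces one value per row of A: the sum of that row's elements, later used for the offset-contribution step of GEMMLowp. A hedged host-side sketch with illustrative names:

#include <cstdint>
#include <vector>

// Row sums of an M x K uint8 matrix stored row-major: sums[m] = sum_k A[m][k].
std::vector<int32_t> matrix_a_reduction(const std::vector<uint8_t> &a, size_t m, size_t k)
{
    std::vector<int32_t> sums(m, 0);
    for(size_t row = 0; row < m; ++row)
    {
        for(size_t col = 0; col < k; ++col)
        {
            sums[row] += a[row * k + col];
        }
    }
    return sums;
}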
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index 2f1f1bf..93332de 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -52,7 +52,11 @@
unsigned int &num_elems_processed_per_iteration)
{
// Select the vector size to use (8 for Bifrost; 16 for Midgard).
- num_elems_processed_per_iteration = gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76) ? 8 : 16;
+ bool is_gpu_bifrost = gpu_target_is_in(gpu_target,
+ GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+ GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+ GPUTarget::G52, GPUTarget::G52LIT);
+ num_elems_processed_per_iteration = is_gpu_bifrost ? 8 : 16;
// Configure kernel window
Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
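The vector-size selection above is essentially a capability table keyed on the GPU target. A hedged sketch of the same pattern in plain C++ (the Target enumeration and helper names are illustrative):

#include <initializer_list>

enum class Target { G71, G72, G76, G51, G51BIG, G51LIT, G52, G52LIT, MIDGARD };

// True if 'target' is one of the listed targets.
bool target_is_in(Target target, std::initializer_list<Target> list)
{
    for(Target t : list)
    {
        if(t == target)
        {
            return true;
        }
    }
    return false;
}

// 8 elements per iteration on Bifrost-class parts, 16 otherwise.
unsigned int vector_size_for(Target target)
{
    return target_is_in(target, { Target::G71, Target::G72, Target::G76,
                                  Target::G51, Target::G51BIG, Target::G51LIT,
                                  Target::G52, Target::G52LIT }) ? 8u : 16u;
}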
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 0c65bb4..825d7fb 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -60,7 +60,7 @@
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float beta)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_UNUSED(input, output, beta);
+ ARM_COMPUTE_UNUSED(beta);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 8530ed2..c9ed776 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLValidate.h"
@@ -48,12 +47,14 @@
{
using ElementsProcessed = Steps;
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+ bool fp_mixed_precision)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (input0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
@@ -111,7 +112,7 @@
unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 1);
+ bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
// In case both input and output have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -217,18 +218,19 @@
{
}
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+ bool fp_mixed_precision)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
// Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, fp_mixed_precision));
_input0 = input0;
_input1 = input1;
_output = output;
_reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 1);
+ _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
// In case both input and output have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -290,6 +292,11 @@
else
{
kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
+ if(fp_mixed_precision && data_type == DataType::F16)
+ {
+ // Currently, the wider accumulator is only supported for FP16 kernels.
+ kernel_name += "_acc32";
+ }
}
}
else // The input tensors have not been reshaped
@@ -305,6 +312,11 @@
if(input0->info()->num_dimensions() != 1)
{
kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+ if(fp_mixed_precision && data_type == DataType::F16)
+ {
+ // Currently, the wider accumulator is only supported for FP16 kernels.
+ kernel_name += "_acc32";
+ }
}
else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
{
@@ -332,6 +344,7 @@
// Set config_id for enabling LWS tuning
_config_id = "gemm_";
_config_id += (is_interleaved_transposed ? "reshaped_" : "");
+ _config_id += (fp_mixed_precision ? "fp_mixed_" : "");
_config_id += (_reinterpret_input_as_3d ? "3di_" : "");
_config_id += (_reinterpret_output_as_3d ? "3do_" : "");
_config_id += lower_string(string_from_data_type(input0->info()->data_type()));
@@ -348,12 +361,12 @@
}
Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
- const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
+ const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision)
{
// Note: num_elements_processed will be set in validate_and_configure_window()
ElementsProcessed num_elements_processed{};
ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info, fp_mixed_precision));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
input1->clone().get(),
output->clone().get(),
@@ -385,7 +398,7 @@
if(_reinterpret_input_as_3d)
{
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
+ // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
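The new fp_mixed_precision flag routes F16 inputs to the _acc32 kernel variants, i.e. multiplication of half-precision values with a wider, single-precision accumulator. A minimal scalar sketch of the idea (standard C++ has no half type, so float/double stand in for the half/float pair used by the kernels):

#include <cstddef>
#include <vector>

// Dot product with a wider accumulator type than the input type.
// In the CL kernel InT is half and AccT is float ("acc32"); here float and
// double stand in for them.
template <typename InT, typename AccT>
AccT dot_wide_acc(const std::vector<InT> &a, const std::vector<InT> &b)
{
    AccT acc = AccT(0);
    for(std::size_t i = 0; i < a.size() && i < b.size(); ++i)
    {
        acc += static_cast<AccT>(a[i]) * static_cast<AccT>(b[i]);
    }
    return acc;
}
// Usage sketch: auto r = dot_wide_acc<float, double>(x, y);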
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 5b29905..aa1b92a 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLValidate.h"
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
new file mode 100644
index 0000000..5d100a4
--- /dev/null
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(anchors, all_anchors);
+ ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
+ if(all_anchors->total_size() > 0)
+ {
+ size_t feature_height = info.feat_height();
+ size_t feature_width = info.feat_width();
+ size_t num_anchors = anchors->dimension(1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(all_anchors, anchors);
+ ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
+ ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
+ }
+ return Status{};
+}
+} // namespace
+
+CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel()
+ : _anchors(nullptr), _all_anchors(nullptr)
+{
+}
+
+void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info));
+
+ // Metadata
+ const size_t num_anchors = anchors->info()->dimension(1);
+ const DataType data_type = anchors->info()->data_type();
+ const float width = info.feat_width();
+ const float height = info.feat_height();
+
+ // Initialize the output if empty
+ const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
+ auto_init_if_empty(*all_anchors->info(), output_shape, 1, data_type);
+
+ // Set instance variables
+ _anchors = anchors;
+ _all_anchors = all_anchors;
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DWIDTH=" + float_to_string_with_full_precision(width));
+ build_opts.add_option("-DHEIGHT=" + float_to_string_with_full_precision(height));
+ build_opts.add_option("-DSTRIDE=" + float_to_string_with_full_precision(1.f / info.spatial_scale()));
+ build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors));
+ build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("generate_proposals_compute_all_anchors", build_opts.options()));
+
+ // The tensor all_anchors can be interpreted as an array of structs (each struct has values_per_roi fields).
+ // This means we don't need to pad on the X dimension, as we know in advance how many fields
+ // compose the struct.
+ Window win = calculate_max_window(*all_anchors->info(), Steps(info.values_per_roi()));
+ ICLKernel::configure_internal(win);
+}
+
+Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
+ return Status{};
+}
+
+void CLComputeAllAnchorsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Collapse everything on the first dimension
+ Window collapsed = window.collapse(ICLKernel::window(), Window::DimX);
+
+ // Set arguments
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _anchors, collapsed);
+ add_1D_tensor_argument(idx, _all_anchors, collapsed);
+
+ // Note that we don't need to loop over the slices, as we are launching exactly
+ // as many threads as there are anchors to generate
+ enqueue(queue, *this, collapsed);
+}
+} // namespace arm_compute
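For context, generate_proposals_compute_all_anchors replicates each base anchor over the feature map, shifting its (x1, y1, x2, y2) coordinates by the stride at every spatial position. A hedged host-side sketch of that expansion (the exact loop ordering and flat output layout are assumptions):

#include <cstddef>
#include <vector>

// Shift each of the 'num_anchors' base anchors (4 values each) across a
// feat_width x feat_height grid with the given stride.
std::vector<float> compute_all_anchors(const std::vector<float> &base_anchors,
                                       std::size_t num_anchors,
                                       std::size_t feat_width, std::size_t feat_height,
                                       float stride)
{
    std::vector<float> all(4 * feat_width * feat_height * num_anchors);
    std::size_t out = 0;
    for(std::size_t y = 0; y < feat_height; ++y)
    {
        for(std::size_t x = 0; x < feat_width; ++x)
        {
            for(std::size_t a = 0; a < num_anchors; ++a)
            {
                const float shift_x = x * stride;
                const float shift_y = y * stride;
                all[out++] = base_anchors[4 * a + 0] + shift_x;
                all[out++] = base_anchors[4 * a + 1] + shift_y;
                all[out++] = base_anchors[4 * a + 2] + shift_x;
                all[out++] = base_anchors[4 * a + 3] + shift_y;
            }
        }
    }
    return all;
}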
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
index ee39c71..b56ad8d 100644
--- a/src/core/CL/kernels/CLHistogramKernel.cpp
+++ b/src/core/CL/kernels/CLHistogramKernel.cpp
@@ -115,6 +115,7 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+ // TODO (COMPMID-679): Add CLMemFill
_output->map(queue, true);
ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
memset(_output->buffer(), 0, _output->size());
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 0ba0d0e..54ef23f 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -109,7 +109,7 @@
const int yin_end = input->dimension(1);
const int xout_start = 0;
- const int xout_end = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration) : output->dimension(0);
+ const int xout_end = input->dimension(0) < num_elems_processed_per_iteration ? output->dimension(0) + (num_elems_processed_per_iteration - input->dimension(0)) : output->dimension(0);
const int yout_start = 0;
const int yout_end = output->dimension(1);
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 54ed51e..97dd919 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -49,9 +49,8 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
// Reduce shape on axis
@@ -62,9 +61,9 @@
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
}
return Status{};
@@ -110,11 +109,36 @@
build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts));
+ std::string kernel_name;
+ unsigned int idx = 0;
+ switch(axis)
+ {
+ case 0:
+ kernel_name = "x";
+ idx = num_arguments_per_1D_tensor() * 3;
+ break;
+ case 1:
+ kernel_name = "y";
+ idx = num_arguments_per_2D_tensor() * 3;
+ break;
+ case 2:
+ kernel_name = "z";
+ idx = num_arguments_per_3D_tensor() * 3;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize_" + kernel_name, build_opts));
// Set epsilon argument
- unsigned int idx = num_arguments_per_1D_tensor() * 3;
- _kernel.setArg<cl_uint>(idx, _epsilon);
+ if(input->info()->data_type() == DataType::F32)
+ {
+ _kernel.setArg<cl_uint>(idx, _epsilon);
+ }
+ else
+ {
+ _kernel.setArg<cl_ushort>(idx, _epsilon);
+ }
// Configure kernel window
auto win_config = validate_and_configure_window(_input->info(), _output->info());
@@ -137,18 +161,58 @@
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
Window window_sum(window);
- window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window in_slice = window.first_slice_window_1D();
- Window sum_slice = window_sum.first_slice_window_1D();
-
- do
+ switch(_axis)
{
- unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, in_slice);
- add_1D_tensor_argument(idx, _sum, sum_slice);
- add_1D_tensor_argument(idx, _output, in_slice);
- enqueue(queue, *this, in_slice);
+ case 0:
+ {
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window in_slice = window.first_slice_window_1D();
+ Window sum_slice = window_sum.first_slice_window_1D();
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _sum, sum_slice);
+ add_1D_tensor_argument(idx, _output, in_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+ }
+ break;
+ case 1:
+ {
+ window_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
+ Window in_slice = window.first_slice_window_2D();
+ Window sum_slice = window_sum.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _sum, sum_slice);
+ add_2D_tensor_argument(idx, _output, in_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ }
+ break;
+ case 2:
+ {
+ window_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ Window in_slice = window.first_slice_window_3D();
+ Window sum_slice = window_sum.first_slice_window_3D();
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _sum, sum_slice);
+ add_3D_tensor_argument(idx, _output, in_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
}
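As a reference for the axis handling above, L2 normalization divides every element by the square root of the (epsilon-clamped) sum of squares along the reduction axis. A minimal sketch for the axis-0 case, assuming the per-row sums of squares are already available:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// L2-normalize each row of a row-major (rows x cols) matrix along axis 0 (x):
// out[r][c] = in[r][c] / sqrt(max(sum_sq[r], epsilon))
void l2_normalize_x(const std::vector<float> &in, const std::vector<float> &sum_sq,
                    std::vector<float> &out, std::size_t rows, std::size_t cols, float epsilon)
{
    out.resize(in.size());
    for(std::size_t r = 0; r < rows; ++r)
    {
        const float norm = std::sqrt(std::max(sum_sq[r], epsilon));
        for(std::size_t c = 0; c < cols; ++c)
        {
            out[r * cols + c] = in[r * cols + c] / norm;
        }
    }
}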
diff --git a/src/core/CL/kernels/CLMemsetKernel.cpp b/src/core/CL/kernels/CLMemsetKernel.cpp
new file mode 100644
index 0000000..ab53897
--- /dev/null
+++ b/src/core/CL/kernels/CLMemsetKernel.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+CLMemsetKernel::CLMemsetKernel()
+ : ICLKernel(), _tensor(nullptr)
+{
+}
+
+void CLMemsetKernel::configure(ICLTensor *tensor,
+ const PixelValue &constant_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ _tensor = tensor;
+
+ const DataType data_type = tensor->info()->data_type();
+ const int vec_size_x = 16 / tensor->info()->element_size();
+ const int output_width_x = tensor->info()->tensor_shape().x();
+ const bool multi_access_x = (output_width_x / vec_size_x > 0);
+
+ // Create and update the window (if needed)
+ Window win = calculate_max_window(*tensor->info());
+ if(multi_access_x)
+ {
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("memset", build_opts.options()));
+}
+
+Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value)
+{
+ ARM_COMPUTE_UNUSED(tensor);
+ ARM_COMPUTE_UNUSED(constant_value);
+ return Status{};
+}
+
+void CLMemsetKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Collapse all the batches onto the second dimension
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
+ Window slice = collapsed.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _tensor, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_2D(slice));
+}
+} // namespace arm_compute
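The configure step above writes VEC_SIZE elements per work item and clamps the last access with LAST_ACCESSED_X so the final vector never runs past the row. A hedged scalar sketch of that boundary handling (the function is illustrative, not the kernel itself):

#include <algorithm>
#include <cstddef>
#include <vector>

// Fill a row with 'value', writing 'vec_size' elements per step and shifting
// the final step back so it never writes past the end (the LAST_ACCESSED_X
// trick used when VEC_SIZE does not divide the row width).
void fill_row(std::vector<float> &row, float value, std::size_t vec_size)
{
    if(vec_size == 0)
    {
        vec_size = 1;
    }
    const std::size_t width = row.size();
    for(std::size_t x = 0; x < width; x += vec_size)
    {
        const std::size_t start = (width > vec_size) ? std::min(x, width - vec_size) : 0;
        const std::size_t count = std::min(vec_size, width - start);
        std::fill(row.begin() + start, row.begin() + start + count, value);
    }
}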
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index eb1ad68..67357da 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLValidate.h"
@@ -61,24 +62,32 @@
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
- const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
- const unsigned int norm_size = norm_info.norm_size();
- bool is_norm_accross_width = norm_idx == 0;
+ const unsigned int num_elems_processed_per_iteration = 4;
- const unsigned int border_width = is_norm_accross_width ? std::min(norm_size / 2, 3U) : 0;
+ const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
+ const bool is_norm_accross_width = norm_idx == 0;
+
+ const unsigned int border_width = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
const BorderSize border_size = BorderSize(0, border_width);
- const unsigned int num_elems_processed_per_iteration = 4;
- const unsigned int num_elems_read_per_iteration = is_norm_accross_width ? (num_elems_processed_per_iteration + 2 * (norm_size / 2)) : num_elems_processed_per_iteration;
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ bool window_changed = false;
// We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
- AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration);
+ // Reads can occur within the valid region of the input
+ if(is_norm_accross_width)
+ {
+ AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+ }
+ else
+ {
+ AccessWindowHorizontal input_access(input, -border_size.left, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+ }
+
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
+ window_changed = window_changed || update_window_and_padding(win, output_access);
output_access.set_valid_region(win, input->valid_region());
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -109,14 +118,15 @@
_input = input;
_output = output;
- const unsigned int norm_idx = get_normalization_dimension_index(input->info()->data_layout(), norm_info);
- _is_norm_across_width = norm_idx == 0;
- const unsigned int border_width = _is_norm_across_width ? std::min(norm_info.norm_size() / 2, 3U) : 0;
- _border_size = BorderSize(0, border_width);
-
const unsigned int num_elems_processed_per_iteration = 4;
const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info);
+ _is_norm_across_width = norm_idx == 0;
+ const unsigned int border_width = _is_norm_across_width ? num_elems_processed_per_iteration - 1 : 0;
+ _border_size = BorderSize(0, border_width);
+
// Set build options
CLBuildOptions build_opts;
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
@@ -127,6 +137,7 @@
build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
+ build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
// Create kernel
std::string kernel_name = _is_norm_across_width ? "normalization_layer_in_map" : "normalization_layer_cross_map";
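For reference, the kernels configured here compute the usual local response normalization: each element is divided by (kappa + coeff * sum of squares over its neighbourhood) raised to beta, with borders clamped to the valid region. A minimal 1D in-map sketch, assuming coeff = alpha / norm_size:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// 1D in-map (across width) local response normalization with clamped borders:
// out[i] = in[i] / pow(kappa + (alpha / norm_size) * sum_{j in window} in[j]^2, beta)
std::vector<float> lrn_in_map_1d(const std::vector<float> &in, std::size_t norm_size,
                                 float alpha, float beta, float kappa)
{
    const std::ptrdiff_t radius = static_cast<std::ptrdiff_t>(norm_size / 2);
    const std::ptrdiff_t size   = static_cast<std::ptrdiff_t>(in.size());
    std::vector<float> out(in.size());
    for(std::ptrdiff_t i = 0; i < size; ++i)
    {
        float sum_sq = 0.f;
        for(std::ptrdiff_t j = std::max<std::ptrdiff_t>(0, i - radius);
            j <= std::min<std::ptrdiff_t>(size - 1, i + radius); ++j)
        {
            sum_sq += in[j] * in[j];
        }
        out[i] = in[i] / std::pow(kappa + (alpha / norm_size) * sum_sq, beta);
    }
    return out;
}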
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
new file mode 100644
index 0000000..a44507b
--- /dev/null
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
+
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *std)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+
+ if(input->data_layout() == DataLayout::NHWC)
+ {
+ AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal std_access(std, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, mean_access, std_access);
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel()
+ : _input(nullptr), _output(nullptr), _mean(nullptr), _std(nullptr)
+{
+}
+
+void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), std->info()));
+
+ _input = input;
+ _output = output;
+ _mean = mean;
+ _std = std;
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const DataType dt = input->info()->data_type();
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+ build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx))));
+
+ std::string kernel_name = "normalize_planar_yuv_layer_";
+ if(is_data_type_quantized(dt))
+ {
+ build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().offset)));
+ build_opts.add_option(("-DSCALE=" + support::cpp11::to_string(input->info()->quantization_info().scale)));
+ kernel_name += "q8_";
+ }
+
+ // Create kernel
+ kernel_name += lower_string(string_from_data_layout(input->info()->data_layout()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), std->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "normalize_planar_yuv_layer_";
+ _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(dt));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+}
+
+Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mean->clone().get(), std->clone().get()).first);
+
+ return Status{};
+}
+
+void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ Window slice_in = collapsed.first_slice_window_1D();
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ add_1D_tensor_argument(idx, _mean, slice_in);
+ add_1D_tensor_argument(idx, _std, slice_in);
+
+ do
+ {
+ idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
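The element-wise operation behind this kernel is out(x, y, c) = (in(x, y, c) - mean(c)) / std(c), applied along the channel dimension; the quantized variant wraps the same formula in de/re-quantization. A hedged sketch for the float NCHW case (layout and names are illustrative):

#include <cstddef>
#include <vector>

// Per-channel normalization of an NCHW float tensor:
// out[c][y][x] = (in[c][y][x] - mean[c]) / std_dev[c]
void normalize_planar_yuv(const std::vector<float> &in, std::vector<float> &out,
                          const std::vector<float> &mean, const std::vector<float> &std_dev,
                          std::size_t channels, std::size_t height, std::size_t width)
{
    out.resize(in.size());
    for(std::size_t c = 0; c < channels; ++c)
    {
        for(std::size_t i = 0; i < height * width; ++i)
        {
            const std::size_t idx = c * height * width + i;
            out[idx] = (in[idx] - mean[c]) / std_dev[c];
        }
    }
}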
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index c6f0f4b..a9a2c5c 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -93,17 +93,17 @@
build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
// Run [2, 0, 1] permute
- if(_perm[0] == 2 && _perm[1] == 0 && _perm[2] == 1)
+ if(_perm == PermutationVector{ 2U, 0U, 1U })
{
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_201", build_opts));
}
// Run [1, 2, 0] permute
- else if(_perm[0] == 1 && _perm[1] == 2 && _perm[2] == 0)
+ else if(_perm == PermutationVector{ 1U, 2U, 0U })
{
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_120", build_opts));
}
// Run [3, 2, 0, 1] permute
- else if(_perm[0] == 3 && _perm[1] == 2 && _perm[2] == 0 && _perm[3] == 1)
+ else if(_perm == PermutationVector{ 3U, 2U, 0U, 1U })
{
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_3201", build_opts));
}
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index 4ca2ef8..286b94e 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -51,9 +51,9 @@
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
@@ -64,7 +64,7 @@
if(output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
@@ -168,27 +168,44 @@
data_type = "DATA_TYPE_INT";
}
+ const bool is_quantized = is_data_type_quantized_asymmetric(input1->info()->data_type());
+
// Construct kernel name
std::string kernel_name = "pixelwise_mul";
- kernel_name += (scale_int >= 0) ? "_int" : "_float";
+ if(!is_data_type_quantized(output->info()->data_type()))
+ {
+ kernel_name += (scale_int >= 0) ? "_int" : "_float";
+ }
// Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
- build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte");
- build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
- build_opts.emplace("-D" + data_type);
+ CLBuildOptions build_opts;
+ if(is_quantized)
+ {
+ build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
+ build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().offset));
+ build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output->info()->quantization_info().offset));
+ build_opts.add_option("-DSCALE_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().scale));
+ build_opts.add_option("-DSCALE_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().scale));
+ build_opts.add_option("-DSCALE_OUT=" + support::cpp11::to_string(output->info()->quantization_info().scale));
+ kernel_name += "_quantized";
+ }
+ else
+ {
+ build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()), "-DWRAP", "-DSATURATE");
+ build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
+ build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_RES=" + compute_type);
+ }
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Set scale argument
unsigned int idx = 3 * num_arguments_per_3D_tensor(); //Skip the inputs and output parameters
- if(scale_int >= 0)
+ if(scale_int >= 0 && !is_quantized)
{
_kernel.setArg(idx++, scale_int);
}
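The new _quantized path follows the usual QASYMM8 arithmetic: de-quantize both inputs with their scale and offset, multiply together with the user scale, then re-quantize with the output scale and offset. A hedged sketch for a single element (rounding mode and saturation are assumptions):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantized (QASYMM8) pixel-wise multiplication of a single element:
// r = ((a - off1) * s1) * ((b - off2) * s2) * scale, re-quantized with (s_out, off_out).
uint8_t pixelwise_mul_qasymm8(uint8_t a, uint8_t b, float scale,
                              float s1, int off1, float s2, int off2,
                              float s_out, int off_out)
{
    const float fa = (static_cast<int>(a) - off1) * s1;
    const float fb = (static_cast<int>(b) - off2) * s2;
    const int   q  = static_cast<int>(std::lround(fa * fb * scale / s_out)) + off_out;
    return static_cast<uint8_t>(std::max(0, std::min(255, q)));
}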
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index df13068..bd21ea0 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -257,6 +257,8 @@
build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+ build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
+ "-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
break;
@@ -315,12 +317,14 @@
unsigned int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ // Collapse window
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
switch(_input->info()->data_layout())
{
case DataLayout::NCHW:
{
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
+ Window slice = window_collapsed.first_slice_window_3D();
do
{
// Upsample input by pool size
@@ -343,21 +347,23 @@
}
case DataLayout::NHWC:
{
- Window slice = window.first_slice_window_3D();
+ const size_t total_batches = _output->info()->tensor_shape().total_size_upper(3);
- Window in_slice = window.first_slice_window_3D();
+ Window slice = window_collapsed.first_slice_window_4D();
+ Window in_slice = window_collapsed.first_slice_window_4D();
in_slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration));
in_slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
in_slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
+ in_slice.set(3, Window::Dimension(0, total_batches, 1));
do
{
// Set inputs
unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_3D_tensor_argument(idx, _output, slice);
+ add_4D_tensor_argument(idx, _input, in_slice);
+ add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
}
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(in_slice));
+ while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
break;
}
default:
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
new file mode 100644
index 0000000..63e745e
--- /dev/null
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+
+ // Check variances
+ const int var_size = info.variances().size();
+ if(var_size > 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
+ for(int i = 0; i < var_size; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.variances().at(i) <= 0.f, "Each variance value must be greater than 0");
+ }
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
+
+ if(!info.max_sizes().empty())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ }
+
+ for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ }
+
+ if(output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors)
+{
+ ARM_COMPUTE_UNUSED(input2);
+ // Output tensor auto initialization if not yet initialized
+ TensorShape output_shape = compute_prior_box_shape(*input1, info);
+ auto_init_if_empty(*output, output_shape, 1, input1->data_type());
+
+ Window win{};
+ bool window_changed = false;
+
+ switch(input1->data_layout())
+ {
+ case DataLayout::NCHW:
+ {
+ const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
+
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, output_access);
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ win = calculate_max_window(*output, Steps());
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ };
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLPriorBoxLayerKernel::CLPriorBoxLayerKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr), _info(), _num_priors(), _min(), _max(), _aspect_ratios()
+{
+}
+
+void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+ _info = info;
+ _min = min;
+ _max = max;
+ _aspect_ratios = aspect_ratios;
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), info));
+
+ // Calculate the number of priors per spatial location
+ _num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+
+ const DataLayout data_layout = input1->info()->data_layout();
+
+ const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ const int layer_width = input1->info()->dimension(width_idx);
+ const int layer_height = input1->info()->dimension(height_idx);
+
+ int img_width = info.img_size().x;
+ int img_height = info.img_size().y;
+ if(img_width == 0 || img_height == 0)
+ {
+ img_width = input2->info()->dimension(width_idx);
+ img_height = input2->info()->dimension(height_idx);
+ }
+
+ float step_x = info.steps()[0];
+ float step_y = info.steps()[1];
+ if(step_x == 0.f || step_y == 0.f)
+ {
+ step_x = static_cast<float>(img_width) / layer_width;
+ step_y = static_cast<float>(img_height) / layer_height;
+ }
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(img_width));
+ build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(img_height));
+ build_opts.add_option("-DLAYER_WIDTH=" + support::cpp11::to_string(layer_width));
+ build_opts.add_option("-DLAYER_HEIGHT=" + support::cpp11::to_string(layer_height));
+ build_opts.add_option("-DSTEP_X=" + support::cpp11::to_string(step_x));
+ build_opts.add_option("-DSTEP_Y=" + support::cpp11::to_string(step_y));
+ build_opts.add_option("-DNUM_PRIORS=" + support::cpp11::to_string(_num_priors));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset()));
+ build_opts.add_option_if(info.clip(), "-DIN_PLACE");
+
+ if(info.variances().size() > 1)
+ {
+ for(unsigned int i = 0; i < info.variances().size(); ++i)
+ {
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i)));
+ }
+ }
+ else
+ {
+ for(unsigned int i = 0; i < 4; ++i)
+ {
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0)));
+ }
+ }
+
+ unsigned int idx = 0;
+ // Create kernel
+ switch(data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ idx = num_arguments_per_2D_tensor();
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nchw", build_opts.options()));
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ idx = num_arguments_per_3D_tensor();
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nhwc", build_opts.options()));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ _kernel.setArg(idx++, *_min);
+ _kernel.setArg(idx++, *_max);
+ _kernel.setArg(idx++, *_aspect_ratios);
+ _kernel.setArg<unsigned int>(idx++, info.min_sizes().size());
+ _kernel.setArg<unsigned int>(idx++, info.max_sizes().size());
+ _kernel.setArg<unsigned int>(idx++, info.aspect_ratios().size());
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info(), info, _num_priors);
+
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
+ const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors)
+ .first);
+
+ return Status{};
+}
+
+void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data());
+ queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data());
+ if(!_info.max_sizes().empty())
+ {
+ queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
+ }
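+ // The writes above are blocking (CL_TRUE), so the parameter buffers are up to date before the kernel runs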
+
+ switch(_input1->info()->data_layout())
+ {
+ case DataLayout::NCHW:
+ {
+ Window slice = window.first_slice_window_2D();
+ slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ Window slice = window.first_slice_window_3D();
+ slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * _num_priors));
+ slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
+
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
new file mode 100644
index 0000000..325eeb2
--- /dev/null
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, rois);
+ ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
+ ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
+ auto_init_if_empty((*output), output_shape, 1, input->data_type());
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 1;
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input_access(input, input->valid_region().start(0), num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLROIAlignLayerKernel::CLROIAlignLayerKernel()
+ : _input(nullptr), _output(nullptr), _rois(nullptr), _pool_info(0, 0, 0.f)
+{
+}
+
+void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ _input = input;
+ _output = output;
+ _rois = rois;
+ _pool_info = pool_info;
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX)));
+ build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY)));
+ build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ)));
+ build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
+ build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
+ build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
+ build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
+
+ // Create kernel
+ std::string kernel_name = "roi_align_layer";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
+ return Status{};
+}
+
+void CLROIAlignLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_rois = slice;
+ // Parallelize spatially and across the fourth dimension of the output tensor (also across the ROI tensor)
+ slice_rois.set_dimension_step(Window::DimX, _rois->info()->dimension(0));
+ slice.set(Window::DimZ, window[3]);
+
+ // Set arguments
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _rois, slice_rois);
+ add_3D_tensor_argument(idx, _output, slice);
+ add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
+ add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
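+ // Only 3D tensor arguments are added, so the batch strides are passed explicitly to let the
+ // kernel step through the fourth dimension of the input and output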
+
+ enqueue(queue, *this, slice);
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index bf36ae2..ef46325 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -39,24 +39,22 @@
namespace
{
-// OpenCL kernel requires input width to be a power of 2.
+// The OpenCL kernel requires the input width to be a power of 2 when reducing along the x-axis.
constexpr unsigned int border_val = 64;
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
{
- ARM_COMPUTE_UNUSED(op);
-
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
-
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Reduction operation not supported for QASYMM8");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+ ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8);
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
}
return Status{};
@@ -69,16 +67,44 @@
output_shape.set(axis, 1);
auto_init_if_empty(*output, output_shape, 1, input->data_type());
- const unsigned int num_elems_processed_per_iteration = 16;
+ const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ bool window_changed = false;
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- const unsigned int border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
-
- AccessWindowStatic input_access(input, 0, 0, input->dimension(0) + border_width, 1);
- AccessWindowHorizontal output_access(output, 0, 1);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, output->valid_region());
+ switch(axis)
+ {
+ case 0:
+ {
+ if(is_data_type_quantized(input->data_type()))
+ {
+ AccessWindowHorizontal input_access(input, 0, input->dimension(0));
+ AccessWindowHorizontal output_access(output, 0, 1);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+ else
+ {
+ const unsigned int border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
+ AccessWindowStatic input_access(input, 0, 0, input->dimension(0) + border_width, 1);
+ AccessWindowHorizontal output_access(output, 0, 1);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+ }
+ break;
+ case 1:
+ case 2:
+ case 3:
+ {
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -96,46 +122,86 @@
return _border_size;
}
-void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
- const unsigned int num_elems_processed_per_iteration = 16;
- const unsigned int width_leftover = input->info()->dimension(0) % border_val;
- const unsigned int border_width = (width_leftover != 0) ? border_val - width_leftover : 0;
- const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op, width));
_input = input;
_output = output;
_reduction_axis = axis;
_op = op;
- // Set the number of WG based on the input size. If input width is < 128
- // we can use fewer threads than 8.
- cl::NDRange lws_hint = cl::NDRange(std::min(8U, num_of_threads));
- _border_size = BorderSize(0, border_width, 0, 0);
-
// Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ CLBuildOptions build_opts;
+ std::string data_type_promoted = get_cl_type_from_data_type(input->info()->data_type());
+ if(is_data_type_quantized(input->info()->data_type()) && axis != 0)
+ {
+ data_type_promoted = "uint";
+ }
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
+ build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE=");
+ build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
switch(op)
{
case ReductionOperation::SUM_SQUARE:
- build_opts.emplace(("-DOPERATION=square_sum"));
+ build_opts.add_option(("-DOPERATION=square_sum"));
break;
case ReductionOperation::SUM:
- build_opts.emplace(("-DOPERATION=sum"));
+ case ReductionOperation::MEAN_SUM:
+ build_opts.add_option(("-DOPERATION=sum"));
break;
default:
ARM_COMPUTE_ERROR("Unsupported reduction operation");
}
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation", build_opts));
+ cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
+ std::string kernel_axis_name;
+ switch(axis)
+ {
+ case 0:
+ {
+ if(!is_data_type_quantized(input->info()->data_type()))
+ {
+ build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
+ const unsigned int width_leftover = input->info()->dimension(0) % border_val;
+ const unsigned int border_width = (width_leftover != 0) ? border_val - width_leftover : 0;
+ const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
+ kernel_axis_name = "x";
+
+ // Set the number of WG based on the input size. If input width is < 128
+ // we can use fewer threads than 8.
+ lws_hint = cl::NDRange(std::min(8U, num_of_threads));
+ _border_size = BorderSize(0, border_width, 0, 0);
+ }
+ else
+ {
+ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ kernel_axis_name = "quantized_x";
+ }
+ }
+ break;
+ case 1:
+ build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ kernel_axis_name = "y";
+ break;
+ case 2:
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ kernel_axis_name = "z";
+ break;
+ case 3:
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
+ kernel_axis_name = "w";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
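+ // Each reduction axis is handled by a dedicated kernel variant, selected here by name suffix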
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation_" + kernel_axis_name, build_opts.options()));
// Configure kernel window
auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
@@ -145,9 +211,9 @@
ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
}
-Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op, width));
ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
return Status{};
@@ -158,28 +224,113 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- // Set out window
- Window out_window(window);
- out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- // Get first input and output slices
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = out_window.first_slice_window_2D();
-
- // Reshape window
- const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
-
- // Set local sums buffer
- unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
- _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
-
- do
+ switch(_reduction_axis)
{
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice, lws_hint());
+ case 0:
+ {
+ // We use parallel reduction only for non-quantized types
+ if(!is_data_type_quantized(_input->info()->data_type()))
+ {
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ // Reshape window
+ const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
+
+ // Set local sums buffer
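+ // One element of local memory per work item in the work group is reserved for the partial sums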
+ unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
+ _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, lws_hint());
+ }
+ while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ }
+ else
+ {
+ // Get first input and output slices
+ Window window_in{ window };
+ window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+
+ Window in_slice = window.first_slice_window_1D();
+ Window out_slice = window.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+ }
+ }
+ break;
+ case 1:
+ {
+ // Get first input and output slices
+ Window window_in{ window };
+ window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ Window in_slice = window_in.first_slice_window_2D();
+ Window out_slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ }
+ break;
+ case 2:
+ {
+ // Get first input and output slices
+ Window window_in{ window };
+ window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+ Window in_slice = window_in.first_slice_window_3D();
+ Window out_slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ }
+ break;
+ case 3:
+ {
+ // Get first input and output slices
+ Window window_in{ window };
+ window_in.set(3, Window::Dimension(0, 1, 1));
+ Window in_slice = window_in.first_slice_window_4D();
+ Window out_slice = window.first_slice_window_4D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, in_slice);
+ add_4D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
}
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
new file mode 100644
index 0000000..7891844
--- /dev/null
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+ DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+CLReorgLayerKernel::CLReorgLayerKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t stride)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
+
+ _input = input;
+ _output = output;
+
+ std::string kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+ const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
+ build_opts.add_option("-DSTRIDE=" + support::cpp11::to_string(stride));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure window
+ // Auto-initialize the output tensor if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
+
+ Window win = calculate_max_window(*output->info(), Steps());
+
+ // CLReorgLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure_internal(win);
+
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += string_from_data_type(input->info()->data_type());
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(stride);
+}
+
+Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
+
+ return Status{};
+}
+
+void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(window.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
index c7efa9a..aa1339d 100644
--- a/src/core/CL/kernels/CLReshapeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
@@ -37,8 +37,28 @@
#include <string>
+/** [CLReshapeLayerKernel Kernel] **/
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != output->tensor_shape().total_size());
+
+ return Status{};
+}
+
+} // namespace
+
CLReshapeLayerKernel::CLReshapeLayerKernel()
: _input(nullptr), _output(nullptr)
{
@@ -46,20 +66,12 @@
void CLReshapeLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
- DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
// Create kernel
std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_layer", build_opts));
@@ -84,17 +96,20 @@
_kernel.setArg<cl_int2>(idx++, output_shape);
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input->info());
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output->info(), 0, 0, output->info()->tensor_shape().x(), output->info()->tensor_shape().y());
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
+ // Set the output valid region
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure_internal(win);
}
+Status CLReshapeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+
+ return Status{};
+}
+
void CLReshapeLayerKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -109,3 +124,4 @@
add_3D_tensor_argument(idx, _output, window_collapsed);
enqueue(queue, *this, slice);
}
+/** [CLReshapeLayerKernel Kernel] **/
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index d56d6f7..ce6c016 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -62,7 +62,7 @@
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(output == input);
@@ -170,6 +170,8 @@
float hr = 0.f;
std::tie(wr, hr) = calculate_scale_factors(*input->info(), *output->info());
+ const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && policy == InterpolationPolicy::BILINEAR;
+
DataLayout data_layout = input->info()->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -200,11 +202,18 @@
build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
build_opts.add_option_if(border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
build_opts.add_option_if_else(sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+ if(call_quantized_kernel)
+ {
+ build_opts.add_option("-DSCALE=" + support::cpp11::to_string(input->info()->quantization_info().scale));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().offset));
+ }
std::string interpolation_name = string_from_interpolation_policy(policy);
std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
- std::string kernel_name = "scale_" + interpolation_name + "_" + lower_string(string_from_data_layout(data_layout));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ std::string kernel_name = "scale_" + interpolation_name;
+ kernel_name += call_quantized_kernel ? "_quantized_" : "_";
+ kernel_name += lower_string(string_from_data_layout(data_layout));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
unsigned int idx = data_layout == DataLayout::NHWC ? 2 * num_arguments_per_3D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
new file mode 100644
index 0000000..d488631
--- /dev/null
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] < padding_left.x() + padding_right.x());
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] / block_shape_x != (output->tensor_shape()[idx_width] - padding_left.x() - padding_right.x()));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] / block_shape_y != (output->tensor_shape()[idx_height] - padding_left.y() - padding_right.y()));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel()
+ : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr)
+{
+}
+
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+
+ _input = input;
+ _block_shape = block_shape;
+ _paddings = paddings;
+ _output = output;
+
+ const DataLayout data_layout = input->info()->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
+ build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
+ build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+ ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+
+ _input = input;
+ _output = output;
+
+ const DataLayout data_layout = input->info()->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
+ build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
+ build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
+ build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
+ build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
+ build_opts.add_option("-DPAD_LEFT_X=" + support::cpp11::to_string(padding_left.x()));
+ build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x()));
+ build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y()));
+ build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+ ICLKernel::configure_internal(win);
+}
+
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
+ return Status{};
+}
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ return Status{};
+}
+
+void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice_out = window.first_slice_window_3D();
+
+ Window slice_in = window.first_slice_window_4D();
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ Window vector_slice = window.first_slice_window_1D();
+ vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window padding_slice = window.first_slice_window_2D();
+ padding_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+ padding_slice.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ int batch_id = 0;
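+ // Enqueue the kernel once per 3D output slice; batch_id tells the kernel which output batch is being written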
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ if(_paddings != nullptr && _block_shape != nullptr)
+ {
+ add_2D_tensor_argument(idx, _paddings, padding_slice);
+ add_1D_tensor_argument(idx, _block_shape, vector_slice);
+ }
+ add_argument(idx, batch_id);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ ++batch_id;
+ }
+ while(window.slide_window_slice_3D(slice_out));
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
new file mode 100644
index 0000000..2d2ba10
--- /dev/null
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+ DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
+ {
+ return i == 0;
+ }));
+
+ // Get expected output shape
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
+ starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
+ ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
+
+ // Checks output if configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape() != exp_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ // Output tensor auto initialization if not yet initialized
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
+ starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+
+ // Create window
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLStridedSliceKernel::CLStridedSliceKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+
+ _input = input;
+ _output = output;
+
+ const TensorShape &input_shape = input->info()->tensor_shape();
+
+ const Coordinates final_strides = arm_compute::helpers::tensor_transform::strided_slice_strides(input_shape, strides);
+ const Coordinates starts_abs = arm_compute::helpers::tensor_transform::strided_slice_absolute_start_coords(input_shape, starts, final_strides, begin_mask);
+ const Coordinates ends_abs = arm_compute::helpers::tensor_transform::strided_slice_absolute_end_coords(input_shape, starts_abs, ends, final_strides, end_mask, shrink_axis_mask);
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+ // Enable processing of multiple elements along x when stride_x is 1 and the output width is at least the access vector size
+ const int vec_size_x = 16 / input->info()->element_size();
+ const int output_width_x = output->info()->tensor_shape().x();
+ const bool multi_access_x = (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
+
+ // Update window if needed
+ if(multi_access_x)
+ {
+ Window &updated_window = std::get<1>(win_config);
+ updated_window.set(Window::DimX,
+ Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win_config.second);
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ {
+ build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
+ build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+ }
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
+ "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
+ "-DSRC_DEPTH=1");
+ build_opts.add_option_if_else(_output->info()->num_dimensions() > 2,
+ "-DDST_DEPTH=" + support::cpp11::to_string(_output->info()->tensor_shape().z()),
+ "-DDST_DEPTH=1");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("strided_slice", build_opts.options()));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "strided_slice";
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ {
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(i));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(starts_abs[i]);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(ends_abs[i]);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(final_strides[i]);
+ }
+}
+
+Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
+ starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
+ .first);
+
+ return Status{};
+}
+
+void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_4D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice);
+ add_4D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(window_collapsed.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 94e15f3..ccf22ea 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLValidate.h"
@@ -86,8 +87,7 @@
if(output->total_size() != 0)
{
- AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration), ceil_to_multiple(output->dimension(1),
- num_elems_processed_per_iteration));
+ AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
window_changed = window_changed || update_window_and_padding(win, output_access);
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
new file mode 100644
index 0000000..ee3fa11
--- /dev/null
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+CLUpsampleLayerKernel::CLUpsampleLayerKernel()
+ : _input(nullptr), _output(nullptr), _info(), _num_elems_processed_per_iteration_input_x()
+{
+}
+
+Status CLUpsampleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy upsampling_policy)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(upsampling_policy);
+
+ DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(upsampling_policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
+
+ return Status{};
+}
+
+void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(upsampling_policy);
+
+ _input = input;
+ _output = output;
+ _info = info;
+ _num_elems_processed_per_iteration_input_x = 1;
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ TensorShape output_shape = misc::shape_calculator::compute_upsample_shape(*input->info(), info);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+ output->info()->set_data_layout(data_layout);
+
+ unsigned int num_elems_processed_per_iteration_x = 16;
+ const int output_width_x = output->info()->dimension(0);
+ const bool multi_access_x = ((output_width_x / num_elems_processed_per_iteration_x) > 0);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLUpsampleLayerKernel::validate(input->info(), output->info(), info, upsampling_policy));
+
+ Window win{};
+
+ switch(data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ win = calculate_max_window(*output->info());
+ win.set(Window::DimY, Window::Dimension(win.y().start(), win.y().end(), info.y()));
+ if(multi_access_x)
+ {
+ _num_elems_processed_per_iteration_input_x = num_elems_processed_per_iteration_x / info.x();
+ win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), num_elems_processed_per_iteration_x), num_elems_processed_per_iteration_x));
+ }
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ win = calculate_max_window(*output->info());
+ win.set(Window::DimY, Window::Dimension(win.y().start(), win.y().end(), info.x()));
+ win.set(Window::DimZ, Window::Dimension(win.z().start(), win.z().end(), info.y()));
+ if(multi_access_x)
+ {
+ _num_elems_processed_per_iteration_input_x = num_elems_processed_per_iteration_x;
+ win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(),
+ num_elems_processed_per_iteration_x),
+ num_elems_processed_per_iteration_x));
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE_IN=" + support::cpp11::to_string(_num_elems_processed_per_iteration_input_x));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X_IN=" + support::cpp11::to_string(std::max<int>(_input->info()->dimension(0) - _num_elems_processed_per_iteration_input_x, 0)));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X_OUT=" + support::cpp11::to_string(std::max<int>(output_width_x - num_elems_processed_per_iteration_x, 0)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("upsample_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
+
+ ICLKernel::configure_internal(win);
+}
+
+void CLUpsampleLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice_out = collapsed_window.first_slice_window_3D();
+ Window slice_in = collapsed_window.first_slice_window_3D();
+
+ DataLayout data_layout = _input->info()->data_layout();
+ switch(data_layout)
+ {
+ case DataLayout::NCHW:
+ slice_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_input_x));
+ slice_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
+ break;
+ case DataLayout::NHWC:
+ slice_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
+ slice_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), 1));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(collapsed_window.slide_window_slice_3D(slice_out) && collapsed_window.slide_window_slice_3D(slice_in));
+}
+} // namespace arm_compute
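
CLUpsampleLayerKernel only accepts a 2x2 stride with the nearest-neighbour policy, so each input element is replicated into a 2x2 block of the output. A scalar reference of that mapping for a single plane, given here as a hedged sketch (plain buffers instead of ICLTensor, row-major NCHW-style indexing assumed):

#include <cstddef>
#include <vector>

// Nearest-neighbour 2x upsampling of one width x height plane.
// Each output pixel (x, y) reads input pixel (x / 2, y / 2).
std::vector<float> upsample_x2_nearest(const std::vector<float> &in, size_t width, size_t height)
{
    std::vector<float> out(4 * width * height);
    for(size_t y = 0; y < 2 * height; ++y)
    {
        for(size_t x = 0; x < 2 * width; ++x)
        {
            out[y * 2 * width + x] = in[(y / 2) * width + (x / 2)];
        }
    }
    return out;
}
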
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
new file mode 100644
index 0000000..b0d27cb
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+ // The window needs to be based on the output
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
+ AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+ input2->dimension(1));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win_collapsed);
+}
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) > output->dimension(0));
+
+ for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
+
+ return Status{};
+}
+} // namespace
+
+CLWidthConcatenate2TensorsKernel::CLWidthConcatenate2TensorsKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void CLWidthConcatenate2TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+ build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x2", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ ICLKernel::configure_internal(std::get<1>(win_config));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "concatenate_width_x2_";
+ _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input1->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input1->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input2->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input2->info()->dimension(1));
+}
+
+void CLWidthConcatenate2TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_4D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input1, slice);
+ add_4D_tensor_argument(idx, _input2, slice);
+ add_4D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, window, lws_hint());
+ }
+ while(window.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
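
Conceptually the x2 kernel writes input1 into columns [0, W1) of the output and input2 into columns [W1, W1 + W2); the -DINPUT1_WIDTH build option gives the OpenCL code the split point. A row-wise scalar sketch of that per-element selection (illustrative only, not the kernel source):

#include <cstddef>
#include <vector>

// Concatenate two rows along the width (x) axis:
// out[x] = in1[x] for x < w1, otherwise in2[x - w1].
std::vector<float> concat_width_x2(const std::vector<float> &in1, const std::vector<float> &in2)
{
    const size_t w1 = in1.size();
    std::vector<float> out(w1 + in2.size());
    for(size_t x = 0; x < out.size(); ++x)
    {
        // Per-element select, mirroring the role of INPUT1_WIDTH in the kernel
        out[x] = (x < w1) ? in1[x] : in2[x - w1];
    }
    return out;
}
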
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
new file mode 100644
index 0000000..75aef9c
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output)
+{
+ // The window needs to be based on the output
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
+ AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+ input2->dimension(1));
+ AccessWindowStatic input3_access(input3, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input3->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+ input3->dimension(1));
+ AccessWindowStatic input4_access(input4, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input4->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+ input4->dimension(1));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input1_access, input2_access, input3_access, input4_access, output_access);
+
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win_collapsed);
+}
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, input3, input4, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) + input3->dimension(0) + input4->dimension(0) > output->dimension(0));
+
+ for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(input3->dimension(i) != output->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(input4->dimension(i) != output->dimension(i));
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
+
+ return Status{};
+}
+} // namespace
+
+CLWidthConcatenate4TensorsKernel::CLWidthConcatenate4TensorsKernel()
+ : _input1(nullptr), _input2(nullptr), _input3(nullptr), _input4(nullptr), _output(nullptr)
+{
+}
+
+Status CLWidthConcatenate4TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, input3, input4, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), input3->clone().get(), input4->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void CLWidthConcatenate4TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), input3->info(), input4->info(), output->info()));
+
+ _input1 = input1;
+ _input2 = input2;
+ _input3 = input3;
+ _input4 = input4;
+ _output = output;
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+ build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
+ build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->info()->dimension(0)));
+ build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0)));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x4", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), input3->info(), input4->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ ICLKernel::configure_internal(std::get<1>(win_config));
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "concatenate_width_x4_";
+ _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input1->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input1->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input2->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input2->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input3->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input3->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input4->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input4->info()->dimension(1));
+}
+
+void CLWidthConcatenate4TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_4D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input1, slice);
+ add_4D_tensor_argument(idx, _input2, slice);
+ add_4D_tensor_argument(idx, _input3, slice);
+ add_4D_tensor_argument(idx, _input4, slice);
+ add_4D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, window, lws_hint());
+ }
+ while(window.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
index e5ab8d2..c51c579 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -53,8 +53,10 @@
AccessWindowHorizontal output_access(output, width_offset, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, input_access, output_access);
+ Window win_collapsed = win.collapse(win, Window::DimZ);
+
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+ return std::make_pair(err, win_collapsed);
}
Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
{
@@ -69,7 +71,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
return Status{};
}
@@ -103,6 +105,7 @@
build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width", build_opts.options()));
@@ -119,14 +122,8 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
- }
- while(window.slide_window_slice_3D(slice));
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, window);
+ add_4D_tensor_argument(idx, _output, window);
+ enqueue(queue, *this, window);
}
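
Adding -DDEPTH alongside the switch to a collapsed window and 4D tensor arguments suggests the kernel now walks a merged depth*batch range on one axis and recovers the two coordinates by modulo/division. A small sketch of that decomposition, stated as an assumption about how DEPTH is consumed rather than as the kernel's actual code:

#include <cstddef>
#include <utility>

// Recover (depth, batch) coordinates from an index that iterates over a
// collapsed depth*batch range, given the per-batch depth.
std::pair<size_t, size_t> split_collapsed_z(size_t collapsed_z, size_t depth)
{
    const size_t z = collapsed_z % depth; // position within one batch's depth
    const size_t b = collapsed_z / depth; // batch index
    return { z, b };
}
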
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index 818638c..55cc465 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
@@ -45,7 +46,8 @@
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
const Size2D kernel_size = winograd_info.kernel_size;
const Size2D output_tile_size = winograd_info.output_tile_size;
@@ -109,9 +111,9 @@
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL");
build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL");
-
const Size2D kernel_size = winograd_info.kernel_size;
const Size2D output_tile_size = winograd_info.output_tile_size;
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index c4e472a..1c31ceb 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
@@ -41,7 +42,8 @@
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
const PadStrideInfo conv_info = winograd_info.convolution_info;
const Size2D output_tile_size = winograd_info.output_tile_size;
@@ -114,6 +116,7 @@
const PadStrideInfo conv_info = winograd_info.convolution_info;
const Size2D output_tile_size = winograd_info.output_tile_size;
const Size2D kernel_size = winograd_info.kernel_size;
+ const DataLayout data_layout = input->info()->data_layout();
const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
@@ -122,7 +125,7 @@
const int num_elements_x = input->info()->dimension(idx_w) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
const int num_elements_y = input->info()->dimension(idx_h) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
- if(input->info()->data_layout() == DataLayout::NCHW)
+ if(data_layout == DataLayout::NCHW)
{
// Check if we need to extend the right or bottom border
const unsigned int extra_border_right = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
@@ -152,6 +155,7 @@
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(output->info()->dimension(1)));
+ const size_t total_batches = input->info()->tensor_shape().total_size_upper(3);
CLBuildOptions build_opts;
build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
@@ -159,14 +163,19 @@
build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
-
- if(input->info()->data_layout() == DataLayout::NHWC)
+ if(data_layout == DataLayout::NHWC)
{
+ build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
}
+ else
+ {
+ build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
+ }
// Create kernel
std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
@@ -175,7 +184,7 @@
const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
// Check optimized kernel if output_dims == 2x2
- if((tile_max_dim == 2) && (input->info()->data_layout() == DataLayout::NCHW))
+ if((tile_max_dim == 2) && (data_layout == DataLayout::NCHW))
{
_step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
}
@@ -183,7 +192,7 @@
// Append stepz and data layout
kernel_name += "_stepz";
kernel_name += support::cpp11::to_string(_step_z);
- kernel_name += "_" + lower_string(string_from_data_layout(input->info()->data_layout()));
+ kernel_name += "_" + lower_string(string_from_data_layout(data_layout));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
@@ -220,17 +229,30 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- const size_t idx_w = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const DataLayout data_layout = _input->info()->data_layout();
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const size_t total_batches = window.shape().total_size_upper(3);
- Window slice = window.first_slice_window_3D();
+ // Collapse window
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+ Window slice = window_collapsed.first_slice_window_3D();
slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
+ if(data_layout == DataLayout::NHWC)
+ {
+ slice.set(idx_h, Window::Dimension(0, _num_tiles_y * total_batches, 1));
+ }
ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0);
slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z));
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
+
do
{
unsigned int idx = 0;
@@ -239,5 +261,5 @@
enqueue(queue, *this, slice, lws_hint());
}
- while(window.slide_window_slice_3D(slice));
+ while(window_collapsed.slide_window_slice_3D(slice));
}
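
The NUM_TILES_X value passed to the build options is derived from num_elements_x/y computed in configure(): for a stride-1 convolution that quantity is the convolution output size, and the tile count is that size divided by the output tile size, rounded up (hence the border extension by output_tile - 1 when the division is not exact). A sketch of that arithmetic, under the assumption of ceiling rounding:

// Number of Winograd tiles along one spatial dimension (stride-1 convolution assumed).
int winograd_num_tiles(int input_size, int kernel_size, int pad_before, int pad_after, int output_tile_size)
{
    // Convolution output size for stride 1; mirrors num_elements_x/y in configure().
    const int conv_out = input_size - (kernel_size - 1) + pad_before + pad_after;
    // Each tile covers output_tile_size convolution outputs, so round the count up.
    return (conv_out + output_tile_size - 1) / output_tile_size;
}
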
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index fa42596..7f1afe0 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
@@ -47,7 +48,8 @@
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout);
@@ -155,6 +157,7 @@
kernel_size,
output_tile_size,
conv_info);
+ const size_t total_batches = output->info()->tensor_shape().total_size_upper(3);
// Set build options
CLBuildOptions build_opts;
@@ -162,6 +165,8 @@
build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
@@ -203,8 +208,11 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+ // Collapse window
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
// Get initial windows
- Window slice = window.first_slice_window_3D();
+ Window slice = window_collapsed.first_slice_window_4D();
slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
// Setup output slice
@@ -214,7 +222,7 @@
if(_bias != nullptr)
{
- unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+ unsigned int idx1 = 2 * num_arguments_per_4D_tensor();
Window slice_biases;
slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape());
add_1D_tensor_argument(idx1, _bias, slice_biases);
@@ -222,15 +230,15 @@
if(_output->info()->data_layout() == DataLayout::NHWC)
{
- unsigned int idx2 = 2 * num_arguments_per_3D_tensor() + ((_bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
+ unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((_bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
_kernel.setArg(idx2, static_cast<int>(_output->info()->total_size() - _output->info()->strides_in_bytes().y()));
}
do
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice_out);
+ add_4D_tensor_argument(idx, _input, slice);
+ add_4D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice, lws_hint());
}
while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
diff --git a/src/core/CL/kernels/CLYOLOLayerKernel.cpp b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
new file mode 100644
index 0000000..7d9dbd4
--- /dev/null
+++ b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_classes <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % (num_classes + 5)) != 0);
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ if(output != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input);
+ }
+
+ const bool is_nchw = input->data_layout() == DataLayout::NCHW;
+ const unsigned int num_elems_processed_per_iteration = is_nchw ? 16 / input->element_size() : 1;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ bool window_changed = false;
+
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+ }
+ else
+ {
+ window_changed = update_window_and_padding(win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLYOLOLayerKernel::CLYOLOLayerKernel()
+ : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+void CLYOLOLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _run_in_place = (output == nullptr) || (output == input);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info, num_classes));
+
+ const bool is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ const unsigned int num_elems_processed_per_iteration = is_nchw ? 16 / input->info()->element_size() : 1;
+ const DataType dt = input->info()->data_type();
+ float a_const = act_info.a();
+ float b_const = act_info.b();
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
+ build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(a_const));
+ build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(b_const));
+ build_opts.add_option("-DNUM_CLASSES=" + support::cpp11::to_string(num_classes));
+ build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+
+ // Create kernel
+ std::string kernel_name = std::string("yolo_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Make sure _kernel is initialized before calling the parent's configure
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "yolo_layer_";
+ _config_id += lower_string(string_from_data_type(dt));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+}
+
+Status CLYOLOLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+ const bool run_in_place = (output == nullptr) || (output == input);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info, num_classes));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()).first);
+
+ return Status{};
+}
+
+void CLYOLOLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ if(!_run_in_place)
+ {
+ add_3D_tensor_argument(idx, _output, slice);
+ }
+ enqueue(queue, *this, slice, lws_hint());
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
\ No newline at end of file
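
validate_arguments() above requires the channel dimension to be a multiple of (num_classes + 5): each predicted box carries 4 coordinates, 1 objectness score and num_classes class scores, so the quotient is the number of boxes per spatial location. A tiny standalone sketch of that check (illustrative helper, not part of the library API):

#include <stdexcept>

// Number of boxes predicted per spatial location for a YOLO head:
// channels must split evenly into blocks of (num_classes + 5)
// = 4 box coordinates + 1 objectness score + num_classes class scores.
int yolo_boxes_per_cell(int channels, int num_classes)
{
    if(num_classes <= 0 || channels % (num_classes + 5) != 0)
    {
        throw std::invalid_argument("channels must be a multiple of num_classes + 5");
    }
    return channels / (num_classes + 5);
}
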
diff --git a/src/core/CPP/ICPPSimpleKernel.cpp b/src/core/CPP/ICPPSimpleKernel.cpp
index 9d18a9c..01fb016 100644
--- a/src/core/CPP/ICPPSimpleKernel.cpp
+++ b/src/core/CPP/ICPPSimpleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,26 @@
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int num_elems_processed_per_iteration,
+ bool border_undefined, const arm_compute::BorderSize &border_size)
+{
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->valid_region(), border_undefined, border_size);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
ICPPSimpleKernel::ICPPSimpleKernel()
: _input{ nullptr }, _output{ nullptr }
@@ -40,14 +59,16 @@
_output = output;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
- output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
-
- ICPPKernel::configure(win);
+ auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, border_undefined, border_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICPPKernel::configure(win_config.second);
}
+
+Status ICPPSimpleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_elems_processed_per_iteration,
+ bool border_undefined, const arm_compute::BorderSize &border_size)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, border_undefined, border_size).first);
+ return Status{};
+}
+
+} // namespace arm_compute
\ No newline at end of file
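
The ICPPSimpleKernel change follows the validate/configure split used throughout this release: a free validate_and_configure_window() returns std::pair<Status, Window>, configure() throws on the Status and keeps the Window, and the static validate() runs the same helper on cloned tensor infos so nothing is mutated. A stripped-down sketch of the pattern with placeholder types standing in for the library's Status and Window (illustration only):

#include <string>
#include <utility>

// Placeholder stand-ins for the library's Status and Window types (illustration only).
struct Status
{
    bool        ok;
    std::string message;
};
struct Window
{
    int num_iterations;
};

// Shared helper: computes the execution window and reports padding problems
// through a Status instead of asserting inside configure().
std::pair<Status, Window> validate_and_configure_window(int tensor_size, int step)
{
    if(step <= 0 || tensor_size % step != 0)
    {
        return { Status{ false, "Insufficient Padding!" }, Window{ 0 } };
    }
    return { Status{ true, "" }, Window{ tensor_size / step } };
}

// configure() would throw on a bad Status and keep the Window; validate() only
// inspects the Status. In the real kernel the helper runs on cloned ITensorInfo
// objects so the caller's tensors are never modified.
Status validate(int tensor_size, int step)
{
    return validate_and_configure_window(tensor_size, step).first;
}
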
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
new file mode 100644
index 0000000..06a0551
--- /dev/null
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <algorithm>
+#include <cmath>
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T>
+std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id)
+{
+ std::vector<int> keep;
+ const int proposals_width = proposals->info()->dimension(1);
+
+ std::vector<T> x1(proposals_width);
+ std::vector<T> y1(proposals_width);
+ std::vector<T> x2(proposals_width);
+ std::vector<T> y2(proposals_width);
+ std::vector<T> areas(proposals_width);
+
+ for(int i = 0; i < proposals_width; ++i)
+ {
+ x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
+ y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
+ x2[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 2, i)));
+ y2[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 3, i)));
+ areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
+ }
+
+ // Note: Soft NMS scores have already been initialized with input scores
+
+ while(!inds.empty())
+ {
+ // Find proposal with max score among remaining proposals
+ int max_pos = 0;
+ for(unsigned int i = 1; i < inds.size(); ++i)
+ {
+ if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
+ {
+ max_pos = i;
+ }
+ }
+ int element = inds.at(max_pos);
+ keep.push_back(element);
+ std::swap(inds.at(0), inds.at(max_pos));
+
+ // Remove the first element and compute the IoU of the remaining boxes with the identified max box
+ inds.erase(inds.begin());
+
+ std::vector<int> sorted_indices_temp;
+ for(auto idx : inds)
+ {
+ const auto xx1 = std::max(x1[idx], x1[element]);
+ const auto yy1 = std::max(y1[idx], y1[element]);
+ const auto xx2 = std::min(x2[idx], x2[element]);
+ const auto yy2 = std::min(y2[idx], y2[element]);
+
+ const auto w = std::max((xx2 - xx1 + 1.f), 0.f);
+ const auto h = std::max((yy2 - yy1 + 1.f), 0.f);
+ const auto inter = w * h;
+ const auto ovr = inter / (areas[element] + areas[idx] - inter);
+
+ // Update scores based on computed IoU, overlap threshold and NMS method
+ T weight;
+ switch(info.soft_nms_method())
+ {
+ case NMSType::LINEAR:
+ weight = (ovr > info.nms()) ? (1.f - ovr) : 1.f;
+ break;
+ case NMSType::GAUSSIAN: // Gaussian
+ weight = std::exp(-1.f * ovr * ovr / info.soft_nms_sigma());
+ break;
+ case NMSType::ORIGINAL: // Original NMS
+ weight = (ovr > info.nms()) ? 0.f : 1.f;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ // Discard boxes with new scores below min threshold and update pending indices
+ scores_in[class_id][idx] *= weight;
+ if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
+ {
+ sorted_indices_temp.push_back(idx);
+ }
+ }
+ inds = sorted_indices_temp;
+ }
+
+ return keep;
+}
+
+template <typename T>
+std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id)
+{
+ std::vector<int> keep;
+
+ const int proposals_width = proposals->info()->dimension(1);
+
+ std::vector<T> x1(proposals_width);
+ std::vector<T> y1(proposals_width);
+ std::vector<T> x2(proposals_width);
+ std::vector<T> y2(proposals_width);
+ std::vector<T> areas(proposals_width);
+
+ for(int i = 0; i < proposals_width; ++i)
+ {
+ x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
+ y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
+ x2[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 2, i)));
+ y2[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 3, i)));
+ areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
+ }
+
+ while(!sorted_indices.empty())
+ {
+ int i = sorted_indices.at(0);
+ keep.push_back(i);
+
+ std::vector<int> sorted_indices_temp = sorted_indices;
+ std::vector<int> new_indices;
+ sorted_indices_temp.erase(sorted_indices_temp.begin());
+
+ for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
+ {
+ const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]);
+ const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]);
+ const float xx2 = std::min(x2[sorted_indices_temp.at(j)], x2[i]);
+ const float yy2 = std::min(y2[sorted_indices_temp.at(j)], y2[i]);
+
+ const float w = std::max((xx2 - xx1 + 1.f), 0.f);
+ const float h = std::max((yy2 - yy1 + 1.f), 0.f);
+ const float inter = w * h;
+ const float ovr = inter / (areas[i] + areas[sorted_indices_temp.at(j)] - inter);
+ const float ctr_x = xx1 + (w / 2);
+ const float ctr_y = yy1 + (h / 2);
+
+ // If suppress_size is specified, filter the boxes based on their size and position
+ const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height());
+ if(ovr <= info.nms() && keep_size)
+ {
+ new_indices.push_back(j);
+ }
+ }
+
+ const unsigned int new_indices_size = new_indices.size();
+ std::vector<int> new_sorted_indices(new_indices_size);
+ for(unsigned int i = 0; i < new_indices_size; ++i)
+ {
+ new_sorted_indices[i] = sorted_indices[new_indices[i] + 1];
+ }
+ sorted_indices = new_sorted_indices;
+ }
+
+ return keep;
+}
+} // namespace
+
+CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel()
+ : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr),
+ _info()
+{
+}
+
+bool CPPBoxWithNonMaximaSuppressionLimitKernel::is_parallelisable() const
+{
+ return false;
+}
+
+template <typename T>
+void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
+{
+ const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
+ const int num_classes = _scores_in->info()->dimension(0);
+ const int scores_count = _scores_in->info()->dimension(1);
+ std::vector<int> total_keep_per_batch(batch_size);
+ std::vector<std::vector<int>> keeps(num_classes);
+ int total_keep_count = 0;
+
+ std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count));
+ for(int i = 0; i < scores_count; ++i)
+ {
+ for(int j = 0; j < num_classes; ++j)
+ {
+ in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i)));
+ }
+ }
+
+ int offset = 0;
+ int cur_start_idx = 0;
+ for(int b = 0; b < batch_size; ++b)
+ {
+ const int num_boxes = _batch_splits_in == nullptr ? 1 : static_cast<int>(*reinterpret_cast<T *>(_batch_splits_in->ptr_to_element(Coordinates(b))));
+ // Skip the first class unless it is the only class.
+ const int j_start = (num_classes == 1 ? 0 : 1);
+ for(int j = j_start; j < num_classes; ++j)
+ {
+ std::vector<T> cur_scores(scores_count);
+ std::vector<int> inds;
+ for(int i = 0; i < scores_count; ++i)
+ {
+ const T score = in_scores[j][i];
+ cur_scores[i] = score;
+
+ if(score > _info.score_thresh())
+ {
+ inds.push_back(i);
+ }
+ }
+ if(_info.soft_nms_enabled())
+ {
+ keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j);
+ }
+ else
+ {
+ std::sort(inds.data(), inds.data() + inds.size(),
+ [&cur_scores](int lhs, int rhs)
+ {
+ return cur_scores[lhs] > cur_scores[rhs];
+ });
+
+ keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j);
+ }
+ total_keep_count += keeps[j].size();
+ }
+
+ if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
+ {
+ // Merge all scores (represented by indices) together and sort
+ auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]()
+ {
+ std::vector<T> ret(total_keep_count);
+
+ int ret_idx = 0;
+ for(unsigned int i = 1; i < keeps.size(); ++i)
+ {
+ auto &cur_keep = keeps[i];
+ for(auto &ckv : cur_keep)
+ {
+ ret[ret_idx++] = in_scores[i][ckv];
+ }
+ }
+
+ std::sort(ret.data(), ret.data() + ret.size());
+
+ return ret;
+ };
+
+ auto all_scores_sorted = get_all_scores_sorted();
+ const T image_thresh = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()];
+ for(int j = 1; j < num_classes; ++j)
+ {
+ auto &cur_keep = keeps[j];
+ std::vector<int> new_keeps_j;
+ for(auto &k : cur_keep)
+ {
+ if(in_scores[j][k] >= image_thresh)
+ {
+ new_keeps_j.push_back(k);
+ }
+ }
+ keeps[j] = new_keeps_j;
+ }
+ total_keep_count = _info.detections_per_im();
+ }
+
+ total_keep_per_batch[b] = total_keep_count;
+
+ // Write results
+ int cur_out_idx = 0;
+ for(int j = j_start; j < num_classes; ++j)
+ {
+ auto &cur_keep = keeps[j];
+ auto cur_out_scores = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+ auto cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+ const int box_column = (cur_start_idx + cur_out_idx) * 4;
+
+ for(unsigned int k = 0; k < cur_keep.size(); ++k)
+ {
+ cur_out_scores[k] = in_scores[j][cur_keep[k]];
+ cur_out_classes[k] = static_cast<T>(j);
+ auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
+ auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
+ auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
+ auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
+ *cur_out_box_row0 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
+ *cur_out_box_row1 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
+ *cur_out_box_row2 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
+ *cur_out_box_row3 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
+ }
+
+ cur_out_idx += cur_keep.size();
+ }
+
+ if(_keeps != nullptr)
+ {
+ cur_out_idx = 0;
+ for(int j = 0; j < num_classes; ++j)
+ {
+ for(unsigned int i = 0; i < keeps[j].size(); ++i)
+ {
+ *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i));
+ }
+ *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size();
+ cur_out_idx += keeps[j].size();
+ }
+ }
+
+ offset += num_boxes;
+ cur_start_idx += total_keep_count;
+ }
+
+ if(_batch_splits_out != nullptr)
+ {
+ for(int b = 0; b < batch_size; ++b)
+ {
+ *reinterpret_cast<float *>(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b];
+ }
+ }
+}
+
+void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
+ ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32);
+ const unsigned int num_classes = scores_in->info()->dimension(0);
+
+ ARM_COMPUTE_UNUSED(num_classes);
+ ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes");
+ ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows");
+
+ ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4);
+ if(keeps != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output");
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keeps_size, 1, DataType::U32);
+ ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0));
+ }
+ if(batch_splits_in != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in);
+ }
+ if(batch_splits_out != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out);
+ }
+
+ _scores_in = scores_in;
+ _boxes_in = boxes_in;
+ _batch_splits_in = batch_splits_in;
+ _scores_out = scores_out;
+ _boxes_out = boxes_out;
+ _classes = classes;
+ _batch_splits_out = batch_splits_out;
+ _keeps = keeps;
+ _keeps_size = keeps_size;
+ _info = info;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*scores_in->info(), Steps(scores_in->info()->dimension(0)));
+
+ IKernel::configure(win);
+}
+
+void CPPBoxWithNonMaximaSuppressionLimitKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
+
+ switch(_scores_in->info()->data_type())
+ {
+ case DataType::F32:
+ run_nmslimit<float>();
+ break;
+ case DataType::F16:
+ run_nmslimit<half>();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+} // namespace arm_compute
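
The hunk above implements the class-wise greedy NMS used by CPPBoxWithNonMaximaSuppressionLimitKernel: candidates are visited in descending score order, and a box survives only if its IoU with the current top box stays at or below info.nms() and, when suppress_size is set, its width, height and centre pass the minimum-size and image-bound checks. A minimal standalone sketch of the same greedy loop, with a hypothetical Box struct and inclusive pixel coordinates (hence the "+ 1" in the width/height), not the library API:

#include <algorithm>
#include <numeric>
#include <vector>

struct Box
{
    float x1, y1, x2, y2; // inclusive pixel coordinates
};

// Greedy NMS sketch: boxes are visited in descending score order and a box is
// kept only when its IoU with every previously kept box is <= nms_thresh.
std::vector<int> greedy_nms(const std::vector<Box> &boxes, const std::vector<float> &scores, float nms_thresh)
{
    std::vector<int> order(boxes.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), [&scores](int lhs, int rhs)
    {
        return scores[lhs] > scores[rhs];
    });

    const auto area = [](const Box &b)
    {
        return (b.x2 - b.x1 + 1.f) * (b.y2 - b.y1 + 1.f);
    };

    std::vector<int> keep;
    while(!order.empty())
    {
        const int i = order.front();
        keep.push_back(i);

        std::vector<int> remaining;
        for(size_t k = 1; k < order.size(); ++k)
        {
            const int   j     = order[k];
            const float w     = std::max(std::min(boxes[i].x2, boxes[j].x2) - std::max(boxes[i].x1, boxes[j].x1) + 1.f, 0.f);
            const float h     = std::max(std::min(boxes[i].y2, boxes[j].y2) - std::max(boxes[i].y1, boxes[j].y1) + 1.f, 0.f);
            const float inter = w * h;
            const float iou   = inter / (area(boxes[i]) + area(boxes[j]) - inter);
            if(iou <= nms_thresh)
            {
                remaining.push_back(j);
            }
        }
        order = remaining;
    }
    return keep;
}

The per-class keep lists are then concatenated, optionally trimmed to detections_per_im by thresholding on the merged scores, and written out together with the class index and the four box coordinates, as in the write-back loop above.
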
diff --git a/src/core/CPP/kernels/CPPFlipWeightsKernel.cpp b/src/core/CPP/kernels/CPPFlipWeightsKernel.cpp
new file mode 100644
index 0000000..2d4c0ce
--- /dev/null
+++ b/src/core/CPP/kernels/CPPFlipWeightsKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+CPPFlipWeightsKernel::CPPFlipWeightsKernel()
+ : _input(nullptr), _output(nullptr), _func(nullptr)
+{
+}
+
+template <typename T>
+void CPPFlipWeightsKernel::flip_weights(const Window &window_input)
+{
+ // Create iterators
+ Iterator in(_input, window_input);
+
+ const DataLayout data_layout = _input->info()->data_layout();
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ const int kernel_width = _input->info()->dimension(idx_w);
+ const int kernel_height = _input->info()->dimension(idx_h);
+
+ execute_window_loop(window_input, [&](const Coordinates & id)
+ {
+ const unsigned int x = kernel_width - id[idx_w] - 1;
+ const unsigned int y = kernel_height - id[idx_h] - 1;
+ Coordinates output_coord(id);
+ output_coord.set(idx_w, x);
+ output_coord.set(idx_h, y);
+ *(reinterpret_cast<T *>(_output->ptr_to_element(output_coord))) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in);
+}
+
+void CPPFlipWeightsKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ // The CPPFlipWeightsKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICPPKernel::configure(win);
+
+ switch(input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = &CPPFlipWeightsKernel::flip_weights<float>;
+ break;
+ case DataType::F16:
+ _func = &CPPFlipWeightsKernel::flip_weights<half>;
+ break;
+ case DataType::QASYMM8:
+ _func = &CPPFlipWeightsKernel::flip_weights<uint8_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+
+void CPPFlipWeightsKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
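
CPPFlipWeightsKernel mirrors each filter in both spatial dimensions, i.e. a 180-degree rotation of every W x H plane, typically so that a transposed convolution can be expressed as an ordinary convolution. A minimal sketch of the same index mapping on a flat array (hypothetical helper, ignoring the Window/Iterator machinery used above):

#include <cstddef>
#include <vector>

// Flip a single W x H kernel plane by 180 degrees: element (x, y) moves to
// (W - 1 - x, H - 1 - y), the same mapping used in flip_weights() above.
template <typename T>
std::vector<T> flip_kernel_plane(const std::vector<T> &src, size_t width, size_t height)
{
    std::vector<T> dst(src.size());
    for(size_t y = 0; y < height; ++y)
    {
        for(size_t x = 0; x < width; ++x)
        {
            dst[(height - 1 - y) * width + (width - 1 - x)] = src[y * width + x];
        }
    }
    return dst;
}
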
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 5e7609c..40b5a2b 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -675,6 +675,7 @@
{
tmp_out_offset += (dst_attrs.stride_x >> dst_shift);
+ // FIXME: need odd/even detection for tmp_out_offset?
mediump vec2 bias_vec = vec2(1.0f, 1.0f);
STORE_PACK2_HALF(dst_ptr, tmp_out_offset, bias_vec);
}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index a65f980..e51908b 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -169,6 +169,7 @@
vec4 c20 = vec4(0.0f);
vec4 c30 = vec4(0.0f);
+ // FIXME: Is loop unrolling really needed for GLES?
for(; int(CURRENT_ITEM_OFFSET(src1_iter)) <= (end_row_mtx_b - 8); TENSOR_ITERATOR_ADVANCE(src0_iter, 8), TENSOR_ITERATOR_ADVANCE(src1_iter, 8))
{
/* Load values from matrix A (interleaved) and matrix B (transposed) */
@@ -1061,6 +1062,7 @@
c30[0] = vec4(0.0f);
c30[1] = vec4(0.0f);
+ // FIXME: Is loop unrolling really needed for GLES?
for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) <= (end_row_mtx_b - 16); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 16), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 32))
{
/* Load values from matrix A (interleaved) and matrix B (transposed) */
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index 8b0d41f..ecff233 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -64,6 +64,8 @@
if(bias != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+ // FIXME: Bug in the framework; currently worked around in the tests.
+ //ARM_COMPUTE_ERROR_ON(bias->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
}
@@ -130,6 +132,7 @@
switch(input->info()->data_type())
{
case DataType::F16:
+ // TODO(APPBROWSER-299): Choose the optimal path and remove the others.
#define PROCESS_4X_3Y_1Z
#if defined(PROCESS_8X_3Y_1Z)
@@ -177,6 +180,7 @@
break;
}
}
+ // FIXME: Keep only one of these paths in the release build.
else
{
switch(input->info()->data_type())
@@ -188,6 +192,7 @@
break;
case DataType::F32:
+ // TODO(APPBROWSER-299): Choose the optimal path and remove the others.
#define PROCESS_4X_1Y_1Z
#if defined(PROCESS_1X_1Y_1Z)
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
index fac2902..03463b2 100644
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
@@ -36,26 +36,75 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
+
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *std)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ const int mean_padding = ceil_to_multiple(mean->dimension(0), num_elems_processed_per_iteration) - mean->dimension(0);
+ const int std_padding = ceil_to_multiple(std->dimension(0), num_elems_processed_per_iteration) - std->dimension(0);
+ AccessWindowStatic mean_access(mean, 0, 0, mean->dimension(0) + mean_padding, mean->dimension(1));
+ AccessWindowStatic std_access(std, 0, 0, std->dimension(0) + std_padding, std->dimension(1));
+
+ const bool window_changed = update_window_and_padding(win, input_access, output_access, mean_access, std_access);
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
GCNormalizePlanarYUVLayerKernel::GCNormalizePlanarYUVLayerKernel()
- : _input(nullptr), _output(nullptr), _mean(nullptr), _sd(nullptr)
+ : _input(nullptr), _output(nullptr), _mean(nullptr), _std(nullptr)
{
}
-void GCNormalizePlanarYUVLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *sd)
+void GCNormalizePlanarYUVLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, sd);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, sd);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), std->info()));
_input = input;
_output = output;
_mean = mean;
- _sd = sd;
-
- const unsigned int num_elems_processed_per_iteration = 4;
+ _std = std;
// Set build options
std::set<std::string> build_opts;
@@ -67,19 +116,17 @@
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("normalize_planar_yuv_layer", build_opts));
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), std->info());
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- const int mean_padding = ceil_to_multiple(mean->info()->dimension(0), num_elems_processed_per_iteration) - mean->info()->dimension(0);
- const int sd_padding = ceil_to_multiple(sd->info()->dimension(0), num_elems_processed_per_iteration) - sd->info()->dimension(0);
- AccessWindowStatic mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + mean_padding, mean->info()->dimension(1));
- AccessWindowStatic sd_access(sd->info(), 0, 0, sd->info()->dimension(0) + sd_padding, sd->info()->dimension(1));
+ IGCKernel::configure(std::get<1>(win_config));
+}
- update_window_and_padding(win, input_access, output_access, mean_access, sd_access);
- output_access.set_valid_region(win, input->info()->valid_region());
-
- IGCKernel::configure(win);
+Status GCNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), mean->clone().get(), std->clone().get())));
+ return Status{};
}
void GCNormalizePlanarYUVLayerKernel::run(const Window &window)
@@ -100,7 +147,7 @@
unsigned int idx = 2 * num_arguments_per_3D_tensor();
add_1D_tensor_argument(idx, _mean, 3, slice_in);
- add_1D_tensor_argument(idx, _sd, 4, slice_in);
+ add_1D_tensor_argument(idx, _std, 4, slice_in);
slice_in = window.first_slice_window_3D();
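
The refactored kernel keeps the same arithmetic, now with the standard-deviation tensor named std: each channel is shifted by its mean and divided by its standard deviation, out = (in - mean[c]) / std[c]. A scalar reference of that per-channel normalisation on a plain NCHW float buffer (hypothetical helper, not the GLES kernel):

#include <cstddef>
#include <vector>

// Reference NormalizePlanarYUV on an NCHW float buffer: every element of
// channel c is normalised with that channel's mean and standard deviation.
void normalize_planar_yuv_ref(std::vector<float> &data, const std::vector<float> &mean, const std::vector<float> &std_dev,
                              size_t channels, size_t height, size_t width)
{
    const size_t plane = height * width;
    for(size_t c = 0; c < channels; ++c)
    {
        for(size_t i = 0; i < plane; ++i)
        {
            data[c * plane + i] = (data[c * plane + i] - mean[c]) / std_dev[c];
        }
    }
}
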
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
index a14a9c9..78e2df1 100644
--- a/src/core/GPUTarget.cpp
+++ b/src/core/GPUTarget.cpp
@@ -51,6 +51,14 @@
{
return arm_compute::GPUTarget::G51LIT;
}
+ else if(version == "G52")
+ {
+ return arm_compute::GPUTarget::G52;
+ }
+ else if(version == "G52LIT")
+ {
+ return arm_compute::GPUTarget::G52LIT;
+ }
else if(version == "G76")
{
return arm_compute::GPUTarget::G76;
@@ -106,6 +114,8 @@
{ GPUTarget::G51, "g51" },
{ GPUTarget::G51BIG, "g51big" },
{ GPUTarget::G51LIT, "g51lit" },
+ { GPUTarget::G52, "g52" },
+ { GPUTarget::G52LIT, "g52lit" },
{ GPUTarget::G76, "g76" },
{ GPUTarget::TTRX, "ttrx" },
{ GPUTarget::TBOX, "tbox" }
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index c73f4e7..be65102 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -102,6 +102,16 @@
return false;
}
+ PaddingSize needed = get_needed_padding(window);
+ PaddingSize available = _info->padding();
+
+ if(needed.top <= available.top && needed.right <= available.right
+ && needed.bottom <= available.bottom
+ && needed.left <= available.left)
+ {
+ return false;
+ }
+
const TensorShape &shape = _info->tensor_shape();
const Strides &strides = _info->strides_in_bytes();
const size_t offset_first_element = _info->offset_first_element_in_bytes();
@@ -206,7 +216,12 @@
{
return false;
}
+ // Update strides in tensor info
+ return _info->extend_padding(get_needed_padding(window));
+}
+PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window) const
+{
ARM_COMPUTE_ERROR_ON(_scale_x == 0);
ARM_COMPUTE_ERROR_ON(_scale_y == 0);
@@ -223,6 +238,5 @@
padding.top = std::max(0, -min_y);
padding.bottom = std::max<int>(0, max_y - shape[1]);
- // Update strides in tensor info
- return _info->extend_padding(padding);
+ return padding;
}
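
The first hunk adds an early exit that compares the padding the access pattern needs with the padding already available on the tensor and bails out when every side is covered; the second hunk factors the padding computation into get_needed_padding() so extend_padding() can reuse it. A compact sketch of the sufficiency check on plain padding values (hypothetical struct mirroring PaddingSize):

struct Padding
{
    int top{ 0 }, right{ 0 }, bottom{ 0 }, left{ 0 };
};

// True when the padding already allocated on every side covers what the
// access pattern needs, i.e. no padding extension or stride update is required.
bool padding_is_sufficient(const Padding &needed, const Padding &available)
{
    return needed.top <= available.top && needed.right <= available.right
           && needed.bottom <= available.bottom && needed.left <= available.left;
}
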
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 7a92c6b..5ce79f1 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -184,7 +184,7 @@
Iterator output(_output, window);
static const float16x8_t CONST_0 = vdupq_n_f16(0.f);
- static const float16x4_t CONST_1_H = vdup_n_f16(1.f);
+ static const float16x8_t CONST_1_H = vdupq_n_f16(1.f);
static const float32x4_t CONST_1_F32 = vdupq_n_f32(1.f);
@@ -240,23 +240,11 @@
break;
case ActivationFunction::LOGISTIC:
{
- const float16x4x2_t in0 =
- {
- vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_low_f16(in.val[0]))))))),
- vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_high_f16(in.val[0]))))))),
- };
-
- const float16x4x2_t in1 =
- {
- vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_low_f16(in.val[1]))))))),
- vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_high_f16(in.val[1]))))))),
- };
-
tmp =
{
{
- vcombine_f16(in0.val[0], in0.val[1]),
- vcombine_f16(in1.val[0], in1.val[1]),
+ vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[0])))),
+ vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[1]))))
}
};
}
@@ -281,6 +269,7 @@
break;
case ActivationFunction::SOFT_RELU:
{
+ // TODO (COMPMID-1535) : Revisit FP16 approximations
const float16x4x2_t in0 =
{
vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[0])))))),
@@ -322,6 +311,7 @@
break;
case ActivationFunction::TANH:
{
+ // TODO (COMPMID-1535) : Revisit FP16 approximations
const float16x8x2_t mul =
{
vmulq_f16(b, in.val[0]),
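
For FP16, LOGISTIC now stays in half-precision vectors instead of converting to F32 and back, while SOFT_RELU and TANH keep the F32 round trip pending COMPMID-1535. Scalar reference forms of the three activations, handy as a ground truth when revisiting the FP16 approximations (plain C++, with a and b taken to be the activation's scale parameters; not the NEON path):

#include <cmath>

// Scalar reference activations matching the vectorised cases above.
inline float logistic_ref(float x)
{
    return 1.f / (1.f + std::exp(-x));
}

inline float soft_relu_ref(float x)
{
    return std::log(1.f + std::exp(x));
}

inline float tanh_ref(float a, float b, float x)
{
    return a * std::tanh(b * x);
}
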
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index a6102b1..169554f 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -456,7 +456,7 @@
Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index 3c76548..ff8fb84 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -46,10 +46,12 @@
namespace
{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -64,8 +66,8 @@
void sub_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -80,8 +82,8 @@
void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -104,8 +106,8 @@
void sub_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -144,8 +146,8 @@
void sub_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -167,8 +169,8 @@
void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -192,8 +194,8 @@
}
void sub_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -213,8 +215,8 @@
void sub_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -234,8 +236,8 @@
void sub_wrap_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -255,8 +257,8 @@
void sub_saturate_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -276,8 +278,8 @@
void sub_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -298,8 +300,8 @@
void sub_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
+ Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+ Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Iterator output(out, window);
execute_window_loop(window, [&](const Coordinates & id)
@@ -318,43 +320,71 @@
input1, input2, output);
}
-inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
{
ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
- && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
- && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
- && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
- && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
- && !(input1->data_type() == DataType::F32 && input2->data_type() == DataType::F32 && output->data_type() == DataType::F32)
- && !(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16 && output->data_type() == DataType::F16),
- "You called subtract with the wrong image formats");
+ const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ // Validate in case of configured output
+ if(output.total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
+ && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
+ && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
+ && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
+ && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
+ && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
+ && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
+ "You called subtract with the wrong image formats");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+ "Wrong shape for output");
+ }
return Status{};
}
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
{
- constexpr unsigned int num_elems_processed_per_iteration = 16;
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+ const TensorShape &out_shape = broadcast_pair.first;
+ const ValidRegion &valid_region = broadcast_pair.second;
- // Configure kernel window
- Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(output, out_shape);
- bool window_changed = update_window_and_padding(win,
- AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration),
- AccessWindowHorizontal(input2, 0, num_elems_processed_per_iteration),
- output_access);
+ if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
+ {
+ set_format_if_unknown(output, Format::S16);
+ }
+ else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
+ {
+ set_format_if_unknown(output, Format::F16);
+ }
+ else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
+ {
+ set_format_if_unknown(output, Format::F32);
+ }
+ }
- ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
- input2->valid_region());
+ Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+ Window win_input1 = win.broadcast_if_dimension_le_one(input1);
+ Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+
+ AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win_input1, input1_access)
+ || update_window_and_padding(win_input2, input2_access)
+ || update_window_and_padding(win, output_access);
output_access.set_valid_region(win, valid_region);
@@ -371,26 +401,11 @@
void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
-
- if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output->info(), Format::S16);
- }
- else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
- {
- set_format_if_unknown(*output->info(), Format::F16);
- }
- else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
- }
- }
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
static std::map<std::string, NEArithmeticSubtractionKernel::SubFunction *> map_function =
{
@@ -427,16 +442,15 @@
_func = it->second;
}
- // Configure kernel window
- auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
return Status{};
}
@@ -450,3 +464,10 @@
(*_func)(_input1, _input2, _output, window);
}
+
+BorderSize NEArithmeticSubtractionKernel::border_size() const
+{
+ const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize(0, border, 0, 0);
+}
\ No newline at end of file
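
NEArithmeticSubtractionKernel now accepts inputs whose shapes differ only in dimensions of size 1 and broadcasts them, which is why every per-type function re-derives its iteration window with broadcast_if_dimension_le_one() and why border_size() reports a right border when one input is narrower than the output. A minimal sketch of the broadcast shape rule itself (hypothetical vector-of-extents representation, analogous to TensorShape::broadcast_shape()):

#include <algorithm>
#include <vector>

// Broadcast rule: two extents are compatible when they are equal or one of
// them is 1; the output takes the larger extent. An empty result signals
// incompatible shapes, mirroring the total_size() == 0 check above.
std::vector<size_t> broadcast_shape(std::vector<size_t> a, std::vector<size_t> b)
{
    if(a.size() < b.size())
    {
        a.resize(b.size(), 1); // pad the shorter shape with unit dimensions
    }
    if(b.size() < a.size())
    {
        b.resize(a.size(), 1);
    }

    std::vector<size_t> out(a.size());
    for(size_t d = 0; d < a.size(); ++d)
    {
        if(a[d] != b[d] && a[d] != 1 && b[d] != 1)
        {
            return {}; // not broadcast compatible
        }
        out[d] = std::max(a[d], b[d]);
    }
    return out;
}
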
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index ac1fc39..683d48b 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -45,13 +45,11 @@
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16,
- DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
if(act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
&& act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
&& act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
@@ -102,16 +100,16 @@
}
} //namespace
-template <bool fused_activation>
+template <bool fused_activation, typename F>
void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window)
{
- static_assert(!fused_activation, "Activation is not supported for FP16");
-
ARM_COMPUTE_UNUSED(window);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Iterator input(_input, window);
Iterator output(_output, window);
+ F activation_functor(_act_info);
+
// Hold information about the current feature map we are iterating.
// Only compute denominator and NEON vectors once per feature map.
int slice = -1;
@@ -151,22 +149,30 @@
// Calculate x bar and store results
const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
const float16x8_t x_bar = vmulq_f16(numerator, denominator);
- vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
+ float16x8_t res = vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec));
+
+ // Perform fused activation
+ if(fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
},
input, output);
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
-template <bool fused_activation>
+template <bool fused_activation, typename F>
void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc(const Window &window)
{
- static_assert(!fused_activation, "Activation is not supported for FP16");
-
ARM_COMPUTE_UNUSED(window);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Iterator input(_input, window);
Iterator output(_output, window);
+ F activation_functor(_act_info);
+
const auto input_mean = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
@@ -186,7 +192,15 @@
// Calculate x bar and store results
const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
const float16x8_t x_bar = vmulq_f16(numerator, denominator);
- vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
+ float16x8_t res = vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec));
+
+ // Perform fused activation
+ if(fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
},
input, output);
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -299,9 +313,12 @@
const bool is_nhwc = _input->info()->data_layout() == DataLayout::NHWC;
switch(_input->info()->data_type())
{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false> : &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false>;
+ _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false, ::detail::dummy<float16_t, 8>> :
+ &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false, ::detail::dummy<float16_t, 8>>;
break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
_func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<false, ::detail::dummy<float, 4>> :
&NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<false, ::detail::dummy<float, 4>>;
@@ -328,9 +345,30 @@
{ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::brelu<float, 4>> },
{ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::lubrelu<float, 4>> }
};
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ // NCHW Fused Batched Normalization with activation functions : FP16
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
+ {
+ { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::relu<float16_t, 8>> },
+ { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::brelu<float16_t, 8>> },
+ { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::lubrelu<float16_t, 8>> }
+ };
+ // NHWC Fused Batched Normalization with activation functions : FP16
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nhwc =
+ {
+ { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::relu<float16_t, 8>> },
+ { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::brelu<float16_t, 8>> },
+ { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::lubrelu<float16_t, 8>> }
+ };
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
switch(_input->info()->data_type())
{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ _func = (_input->info()->data_layout() == DataLayout::NHWC) ? bn_fused_map_f16_nhwc[_act_info.activation()] : bn_fused_map_f16_nchw[_act_info.activation()];
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
_func = (_input->info()->data_layout() == DataLayout::NHWC) ? bn_fused_map_f32_nhwc[_act_info.activation()] : bn_fused_map_f32_nchw[_act_info.activation()];
break;
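
With the FP16 function maps in place, batch normalisation and the activation are fused into a single pass: each element is normalised, scaled and shifted, and the activation functor is applied before the result is stored. A scalar view of one fused element with a bounded ReLU (hypothetical helper; the vector code uses the ::detail functors instead):

#include <algorithm>
#include <cmath>

// One batch-normalised element with a fused bounded ReLU:
// x_bar = (x - mean) / sqrt(var + epsilon); res = beta + gamma * x_bar.
inline float batch_norm_fused_brelu(float x, float mean, float var, float beta, float gamma, float epsilon, float upper_bound)
{
    const float x_bar = (x - mean) / std::sqrt(var + epsilon);
    const float res   = beta + gamma * x_bar;
    return std::min(std::max(res, 0.f), upper_bound); // BOUNDED_RELU clamp
}
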
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index dc37452..fa51a7b 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -51,744 +51,6 @@
constexpr int MAYBE = 127;
} // namespace
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace fp16
-{
-inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
-{
- // Constant use for evaluating score1 and score3
- static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
- static const float32x4_t zero = vdupq_n_f32(0.0f);
- static const float32x4_t one = vdupq_n_f32(1.0f);
- static const float32x4_t two = vdupq_n_f32(2.0f);
- static const float32x4_t three = vdupq_n_f32(3.0f);
-
- // Score0: (1, 0)
- const float32x4x2_t score0 =
- {
- vabsq_f32(gx.val[0]),
- vabsq_f32(gx.val[1])
- };
-
- // Score2: ( 0, 1 )
- const float32x4x2_t score2 =
- {
- vabsq_f32(gy.val[0]),
- vabsq_f32(gy.val[1])
- };
-
- // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
- float32x4x2_t score1 =
- {
- vmulq_f32(gy.val[0], const45),
- vmulq_f32(gy.val[1], const45)
- };
-
- float32x4x2_t score3 = score1;
-
- score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
- score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
- score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
- score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);
-
- score1.val[0] = vabsq_f32(score1.val[0]);
- score1.val[1] = vabsq_f32(score1.val[1]);
- score3.val[0] = vabsq_f32(score3.val[0]);
- score3.val[1] = vabsq_f32(score3.val[1]);
-
- float32x4x2_t phase =
- {
- zero,
- zero
- };
-
- float32x4x2_t old_score = score0;
-
- // score1 > old_score?
- uint32x4x2_t mask =
- {
- vcgtq_f32(score1.val[0], old_score.val[0]),
- vcgtq_f32(score1.val[1], old_score.val[1])
- };
-
- phase.val[0] = vbslq_f32(mask.val[0], one, phase.val[0]);
- phase.val[1] = vbslq_f32(mask.val[1], one, phase.val[1]);
- old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
- old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);
-
- // score2 > old_score?
- mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
- mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);
-
- phase.val[0] = vbslq_f32(mask.val[0], two, phase.val[0]);
- phase.val[1] = vbslq_f32(mask.val[1], two, phase.val[1]);
- old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
- old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);
-
- // score3 > old_score?
- mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
- mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);
-
- phase.val[0] = vbslq_f32(mask.val[0], three, phase.val[0]);
- phase.val[1] = vbslq_f32(mask.val[1], three, phase.val[1]);
- old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
- old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);
-
- // Convert from float32x4_t to uint8x8_t
- return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
- vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
-}
-
-inline uint8x8_t phase_quantization(float16x8_t gx, float16x8_t gy)
-{
- // Constant use for evaluating score1 and score3
- static const float16x8_t const45 = vdupq_n_f16(0.70710678118655f);
- static const float16x8_t zero = vdupq_n_f16(0.0f);
- static const float16x8_t one = vdupq_n_f16(1.0f);
- static const float16x8_t two = vdupq_n_f16(2.0f);
- static const float16x8_t three = vdupq_n_f16(3.0f);
-
- // Score0: (1, 0)
- const float16x8_t score0 = vabsq_f16(gx);
-
- // Score2: ( 0, 1 )
- const float16x8_t score2 = vabsq_f16(gy);
-
- // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
- float16x8_t score1 = vmulq_f16(gy, const45);
- float16x8_t score3 = score1;
-
- score1 = vfmaq_f16(score1, gx, const45);
- score3 = vfmsq_f16(score3, gx, const45);
-
- score1 = vabsq_f16(score1);
- score3 = vabsq_f16(score3);
-
- float16x8_t phase = zero;
- float16x8_t old_score = score0;
-
- // score1 > old_score?
- uint16x8_t mask = vcgtq_f16(score1, old_score);
-
- phase = vbslq_f16(mask, one, phase);
- old_score = vbslq_f16(mask, score1, old_score);
-
- // score2 > old_score?
- mask = vcgtq_f16(score2, old_score);
-
- phase = vbslq_f16(mask, two, phase);
- old_score = vbslq_f16(mask, score2, old_score);
-
- // score3 > old_score?
- mask = vcgtq_f16(score3, old_score);
-
- phase = vbslq_f16(mask, three, phase);
-
- // Convert from float16x8_t to uint8x8_t
- return vmovn_u16(vcvtq_u16_f16(phase));
-}
-
-/** Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
- * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return quantized phase for 8 pixels
- */
-inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
-{
- return phase_quantization(vcvtq_f16_s16(gx), vcvtq_f16_s16(gy));
-}
-
-/** Computes the gradient phase if gradient_size = 7. The output is quantized.
- * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return quantized phase for 8 pixels
- */
-inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
- // Convert to float
- const float32x4x2_t gx_f32 =
- {
- vcvtq_f32_s32(gx.val[0]),
- vcvtq_f32_s32(gx.val[1])
- };
-
- const float32x4x2_t gy_f32 =
- {
- vcvtq_f32_s32(gy.val[0]),
- vcvtq_f32_s32(gy.val[1])
- };
-
- return phase_quantization(gx_f32, gy_f32);
-}
-
-/** Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
-{
- return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
- vreinterpretq_u16_s16(vabsq_s16(gy)));
-}
-
-/** Computes the magnitude using the L1-norm type if gradient_size = 7
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
- const uint32x4x2_t gx_abs =
- {
- vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
- vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
- };
-
- const uint32x4x2_t gy_abs =
- {
- vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
- vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
- };
-
- const uint32x4x2_t out =
- {
- vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
- vaddq_u32(gx_abs.val[1], gy_abs.val[1])
- };
-
- return out;
-}
-
-inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
-{
- // x^2 ...
- float32x4x2_t mag =
- {
- vmulq_f32(gx.val[0], gx.val[0]),
- vmulq_f32(gx.val[1], gx.val[1])
- };
-
- // ... + y^2
- mag.val[0] = vmlaq_f32(mag.val[0], gy.val[0], gy.val[0]);
- mag.val[1] = vmlaq_f32(mag.val[1], gy.val[1], gy.val[1]);
-
- // sqrt(...)
- mag.val[0] = vmulq_f32(vrsqrteq_f32(mag.val[0]), mag.val[0]);
- mag.val[1] = vmulq_f32(vrsqrteq_f32(mag.val[1]), mag.val[1]);
-
- return mag;
-}
-
-inline float16x8_t mag_l2(float16x8_t gx, float16x8_t gy)
-{
- // x^2 ...
- float16x8_t mag = vmulq_f16(gx, gx);
-
- // ... + y^2
- mag = vfmaq_f16(mag, gy, gy);
-
- // sqrt(...)
- mag = vmulq_f16(vrsqrteq_f16(mag), mag);
-
- return mag;
-}
-
-/** Computes the magnitude using L2-norm if gradient_size = 3 or 5
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
-{
- /* Compute magnitude using L2 normalization */
- const float16x8_t gx2 = vcvtq_f16_s16(gx);
- const float16x8_t gy2 = vcvtq_f16_s16(gy);
- const float16x8_t mag = mag_l2(gx2, gy2);
-
- /* Store magnitude - Convert to uint16x8 */
- return vcvtq_u16_f16(mag);
-}
-
-/** Computes the magnitude using L2-norm if gradient_size = 7
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
- // Compute magnitude using L2 normalization
- float32x4x2_t gx2 =
- {
- vcvtq_f32_s32(gx.val[0]),
- vcvtq_f32_s32(gx.val[1])
- };
-
- float32x4x2_t gy2 =
- {
- vcvtq_f32_s32(gy.val[0]),
- vcvtq_f32_s32(gy.val[1])
- };
-
- const float32x4x2_t mag = mag_l2(gx2, gy2);
- const uint32x4x2_t mag32 =
- {
- vcvtq_u32_f32(mag.val[0]),
- vcvtq_u32_f32(mag.val[1])
- };
-
- return mag32;
-}
-
-/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- const auto in1 = static_cast<const int16_t *__restrict>(in1_ptr);
- const auto in2 = static_cast<const int16_t *__restrict>(in2_ptr);
- const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
- const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- const int16x8x4_t gx =
- {
- vld1q_s16(in1),
- vld1q_s16(in1 + 8),
- vld1q_s16(in1 + 16),
- vld1q_s16(in1 + 24)
- };
-
- const int16x8x4_t gy =
- {
- vld1q_s16(in2),
- vld1q_s16(in2 + 8),
- vld1q_s16(in2 + 16),
- vld1q_s16(in2 + 24)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
- vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
- vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
- vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
-
- // Compute ans store magnitude using L1 normalization
- vst1q_u16(out1 + 0, mag_l1_S16_S16(gx.val[0], gy.val[0]));
- vst1q_u16(out1 + 8, mag_l1_S16_S16(gx.val[1], gy.val[1]));
- vst1q_u16(out1 + 16, mag_l1_S16_S16(gx.val[2], gy.val[2]));
- vst1q_u16(out1 + 24, mag_l1_S16_S16(gx.val[3], gy.val[3]));
-}
-
-/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- const auto in1 = static_cast<const int16_t *__restrict>(in1_ptr);
- const auto in2 = static_cast<const int16_t *__restrict>(in2_ptr);
- const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
- const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- const int16x8x4_t gx =
- {
- vld1q_s16(in1),
- vld1q_s16(in1 + 8),
- vld1q_s16(in1 + 16),
- vld1q_s16(in1 + 24)
- };
-
- const int16x8x4_t gy =
- {
- vld1q_s16(in2),
- vld1q_s16(in2 + 8),
- vld1q_s16(in2 + 16),
- vld1q_s16(in2 + 24)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
- vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
- vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
- vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
-
- // Compute and store magnitude using L2 normalization
- vst1q_u16(out1 + 0, mag_l2_S16_S16(gx.val[0], gy.val[0]));
- vst1q_u16(out1 + 8, mag_l2_S16_S16(gx.val[1], gy.val[1]));
- vst1q_u16(out1 + 16, mag_l2_S16_S16(gx.val[2], gy.val[2]));
- vst1q_u16(out1 + 24, mag_l2_S16_S16(gx.val[3], gy.val[3]));
-}
-
-/** Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- auto in1 = static_cast<const int32_t *__restrict>(in1_ptr);
- auto in2 = static_cast<const int32_t *__restrict>(in2_ptr);
- auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
- auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- // Process low and high part
- for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
- {
- const int32x4x2_t gx0 =
- {
- vld1q_s32(in1 + 0),
- vld1q_s32(in1 + 4)
- };
-
- const int32x4x2_t gx1 =
- {
- vld1q_s32(in1 + 8),
- vld1q_s32(in1 + 12)
- };
-
- const int32x4x2_t gy0 =
- {
- vld1q_s32(in2 + 0),
- vld1q_s32(in2 + 4)
- };
-
- const int32x4x2_t gy1 =
- {
- vld1q_s32(in2 + 8),
- vld1q_s32(in2 + 12)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
- vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
-
- // Compute magnitude using L1 normalization
- const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
- const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);
-
- // Store magnitude
- vst1q_u32(out1 + 0, mag0.val[0]);
- vst1q_u32(out1 + 4, mag0.val[1]);
- vst1q_u32(out1 + 8, mag1.val[0]);
- vst1q_u32(out1 + 12, mag1.val[1]);
- }
-}
-
-/** Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
- *
- * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32
- * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
- auto in1 = static_cast<const int32_t *__restrict>(in1_ptr);
- auto in2 = static_cast<const int32_t *__restrict>(in2_ptr);
- auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
- auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
- // Process low and high part
- for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
- {
- const int32x4x2_t gx0 =
- {
- vld1q_s32(in1 + 0),
- vld1q_s32(in1 + 4)
- };
-
- const int32x4x2_t gx1 =
- {
- vld1q_s32(in1 + 8),
- vld1q_s32(in1 + 12)
- };
-
- const int32x4x2_t gy0 =
- {
- vld1q_s32(in2 + 0),
- vld1q_s32(in2 + 4)
- };
-
- const int32x4x2_t gy1 =
- {
- vld1q_s32(in2 + 8),
- vld1q_s32(in2 + 12)
- };
-
- // Compute and store phase
- vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
- vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
-
- // Compute magnitude using L2 normalization
- const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
- const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);
-
- // Store magnitude
- vst1q_u32(out1 + 0, mag0.val[0]);
- vst1q_u32(out1 + 4, mag0.val[1]);
- vst1q_u32(out1 + 8, mag1.val[0]);
- vst1q_u32(out1 + 12, mag1.val[1]);
- }
-}
-
-inline uint16x4_t non_max_U32_helper(const uint32_t *in, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
-{
- // Phase for 4 pixels
- const uint32x4_t pc32 = vmovl_u16(pc);
-
- // Get magnitude for 4 pixels
- uint32x4_t mc = vld1q_u32(in);
-
- // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- // 0 degree
- const uint32x4_t mk0_0 = vld1q_u32(in - 1);
- const uint32x4_t mk0_1 = vld1q_u32(in + 1);
- uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
- mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_0));
- mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_1));
-
- // 45 degree
- const uint32x4_t mk45_0 = vld1q_u32(in - stride_mag - 1);
- const uint32x4_t mk45_1 = vld1q_u32(in + stride_mag + 1);
- uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1));
- mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_0));
- mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_1));
-
- // 90 degree
- const uint32x4_t mk90_0 = vld1q_u32(in - stride_mag);
- const uint32x4_t mk90_1 = vld1q_u32(in + stride_mag);
- uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2));
- mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_0));
- mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_1));
-
- // 135 degree
- const uint32x4_t mk135_0 = vld1q_u32(in - stride_mag + 1);
- const uint32x4_t mk135_1 = vld1q_u32(in + stride_mag - 1);
- uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3));
- mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_0));
- mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_1));
-
- // Merge masks
- mask0 = vorrq_u32(mask0, mask1);
- mask2 = vorrq_u32(mask2, mask3);
- mask0 = vorrq_u32(mask0, mask2);
-
- mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));
-
- // mc > upper_thr
- mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));
-
- // mc <= lower_thr
- mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));
-
- // mc <= upper_thr && mc > lower_thr
- mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
- mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));
-
- mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
- mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
- mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);
-
- return vmovn_u32(mc);
-}
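The NEON helper above performs the non-maximum-suppression and double-thresholding step of Canny edge detection on four pixels at a time: a pixel survives only if its magnitude strictly exceeds both neighbours along its quantized gradient direction, and the surviving magnitude is then classified against the two thresholds. A scalar sketch of the same logic, assuming interior pixels (valid neighbours on all sides) and taking the EDGE/MAYBE/NO_EDGE markers as parameters since their values are not shown in this hunk:

    // Scalar equivalent of non_max_U32_helper for a single pixel.
    // 'mag' points at the pixel inside a U32 magnitude image with row stride 'stride_mag';
    // 'phase' is the quantized direction: 0 = 0 deg, 1 = 45 deg, 2 = 90 deg, 3 = 135 deg.
    #include <cstdint>

    uint32_t non_max_suppress_scalar(const uint32_t *mag, uint32_t phase, int32_t stride_mag,
                                     uint32_t lower_thr, uint32_t upper_thr,
                                     uint32_t EDGE, uint32_t MAYBE, uint32_t NO_EDGE)
    {
        const uint32_t mc = *mag;
        uint32_t n0 = 0, n1 = 0;

        switch(phase)
        {
            case 0: n0 = mag[-1];              n1 = mag[+1];              break; // horizontal neighbours
            case 1: n0 = mag[-stride_mag - 1]; n1 = mag[+stride_mag + 1]; break; // 45 degree neighbours
            case 2: n0 = mag[-stride_mag];     n1 = mag[+stride_mag];     break; // vertical neighbours
            case 3: n0 = mag[-stride_mag + 1]; n1 = mag[+stride_mag - 1]; break; // 135 degree neighbours
        }

        // Keep the pixel only if it is a strict local maximum along its gradient direction
        const uint32_t kept = (mc > n0 && mc > n1) ? mc : 0;

        // Hysteresis classification against the two thresholds
        if(kept > upper_thr)  return EDGE;
        if(kept <= lower_thr) return NO_EDGE;
        return MAYBE; // lower_thr < kept <= upper_thr
    }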
-
-/** Computes edge tracing when called recursively by edge_trace_U8_U8
- *
- * @param[in] in Pointer to source image. Data type supported U8
- * @param[out] out Pointer to destination image. Data type supported U8
- * @param[in] in_stride Stride of the input image
- * @param[in] out_stride Stride of the output image
- */
-void edge_trace_recursive_U8_U8(uint8_t *__restrict in, uint8_t *__restrict out, const int32_t in_stride, const int32_t out_stride)
-{
- // Look for MAYBE pixels in 8 directions
- *out = EDGE;
-
- // (-1, 0)
- uint8_t pixel = *(in - 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in - 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
- }
-
- // (+1, 0)
- pixel = *(in + 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in + 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
- }
-
- in -= in_stride;
- out -= out_stride;
-
- // (-1, -1)
- pixel = *(in - 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in - 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
- }
-
- // (0, -1)
- pixel = *in;
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *in = EDGE;
-
- edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
- }
-
- // (+1, -1)
- pixel = *(in + 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in + 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
- }
-
- in += in_stride * 2;
- out += out_stride * 2;
-
- // (-1, +1)
- pixel = *(in - 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in - 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
- }
-
- // (0, +1)
- pixel = *in;
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *in = EDGE;
-
- edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
- }
-
- // (+1, +1)
- pixel = *(in + 1);
-
- if(pixel == MAYBE)
- {
- // Touched a MAYBE point. MAYBE becomes EDGE
- *(in + 1) = EDGE;
-
- edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
- }
-}
-} // namespace fp16
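The recursive tracer above promotes MAYBE pixels to EDGE in all eight directions from a confirmed edge point, so its recursion depth grows with the length of the connected contour. The same hysteresis tracing can be expressed with an explicit worklist; this is only an equivalent reformulation for clarity, not the library's implementation, and like the original it assumes the caller keeps the walk away from the image borders:

    // Worklist formulation of hysteresis edge tracing: starting from a seed EDGE pixel,
    // promote every 8-connected MAYBE pixel to EDGE and mirror the result into 'out'.
    #include <cstdint>
    #include <utility>
    #include <vector>

    void edge_trace_worklist(uint8_t *in, uint8_t *out,
                             int32_t in_stride, int32_t out_stride,
                             uint8_t EDGE, uint8_t MAYBE)
    {
        std::vector<std::pair<uint8_t *, uint8_t *>> stack{ { in, out } };

        while(!stack.empty())
        {
            uint8_t *p_in  = stack.back().first;
            uint8_t *p_out = stack.back().second;
            stack.pop_back();

            *p_out = EDGE;

            // Visit the 8 neighbours
            for(int dy = -1; dy <= 1; ++dy)
            {
                for(int dx = -1; dx <= 1; ++dx)
                {
                    if(dx == 0 && dy == 0)
                    {
                        continue;
                    }

                    uint8_t *n_in = p_in + dy * in_stride + dx;
                    if(*n_in == MAYBE)
                    {
                        *n_in = EDGE; // mark in the input so the pixel is queued only once
                        stack.emplace_back(n_in, p_out + dy * out_stride + dx);
                    }
                }
            }
        }
    }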
-
-void NEGradientFP16Kernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
-
- set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
- set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
-
- Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
- set_format_if_unknown(*magnitude->info(), magnitude_format);
- set_format_if_unknown(*phase->info(), Format::U8);
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
- ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
-
- _gx = gx;
- _gy = gy;
- _magnitude = magnitude;
- _phase = phase;
-
- if(_gx->info()->data_type() == DataType::S16)
- {
- if(norm_type == 1)
- {
- _func = &fp16::mag_phase_l1norm_S16_S16_U16_U8;
- }
- else
- {
- _func = &fp16::mag_phase_l2norm_S16_S16_U16_U8;
- }
- }
- else
- {
- if(norm_type == 1)
- {
- _func = &fp16::mag_phase_l1norm_S32_S32_U32_U8;
- }
- else
- {
- _func = &fp16::mag_phase_l2norm_S32_S32_U32_U8;
- }
- }
-
- constexpr unsigned int num_elems_processed_per_iteration = 32;
-
- // Configure kernel window
- Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
-
- mag_access.set_valid_region(win, _gx->info()->valid_region());
- phase_access.set_valid_region(win, _gx->info()->valid_region());
-
- INEKernel::configure(win);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
namespace
{
inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
new file mode 100644
index 0000000..f8217d3
--- /dev/null
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input,
+ 1,
+ DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
+
+ const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with fewer than 2 groups would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with the same number of groups as channels would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ }
+
+ return Status{};
+}
+void channel_shuffle_nhwc(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window)
+{
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ const size_t element_size = input->info()->element_size();
+ const unsigned int K = input->info()->dimension(channel_idx) / num_groups;
+ const float rK = 1.f / K;
+
+ Iterator in(input, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Shuffle channel
+ const unsigned int curr_channel = id.x();
+ const unsigned int group_id = curr_channel * rK;
+ const unsigned int r = group_id * K;
+ const unsigned int channel_id = curr_channel - r;
+
+ // Calculate output coordinates
+ Coordinates out_coords = id;
+ out_coords.set(Window::DimX, channel_id * num_groups + group_id);
+ std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
+ },
+ in);
+}
+void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window)
+{
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ const unsigned int height = input->info()->tensor_shape().y();
+ const size_t input_stride_y = input->info()->strides_in_bytes().y();
+ const size_t output_stride_y = output->info()->strides_in_bytes().y();
+ const size_t row_size = input->info()->dimension(width_idx) * input->info()->element_size();
+
+ const unsigned int K = input->info()->dimension(channel_idx) / num_groups;
+ const float rK = 1.f / K;
+
+ Iterator in(input, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ // Shuffle channel
+ const unsigned int curr_channel = id.z();
+ const unsigned int group_id = curr_channel * rK;
+ const unsigned int r = group_id * K;
+ const unsigned int channel_id = curr_channel - r;
+
+ // Calculate output coordinates
+ Coordinates out_coords = id;
+ out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
+ const uint8_t *input_ptr = in.ptr();
+ uint8_t *output_ptr = output->ptr_to_element(out_coords);
+
+ // Copy plane
+ for(unsigned int y = 0; y < height; ++y)
+ {
+ std::copy_n(input_ptr, row_size, output_ptr);
+ input_ptr += input_stride_y;
+ output_ptr += output_stride_y;
+ }
+ },
+ in);
+}
+} // namespace
+
+NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel()
+ : _input(nullptr), _output(nullptr), _num_groups()
+{
+}
+
+void NEChannelShuffleLayerKernel::configure(const ITensor *input, ITensor *output, unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+
+ _input = input;
+ _output = output;
+ _num_groups = num_groups;
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ // The NEChannelShuffleLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
+ return Status{};
+}
+
+void NEChannelShuffleLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ switch(_input->info()->data_layout())
+ {
+ case DataLayout::NHWC:
+ channel_shuffle_nhwc(_input, _output, _num_groups, window);
+ break;
+ case DataLayout::NCHW:
+ channel_shuffle_nchw(_input, _output, _num_groups, window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data layout!");
+ break;
+ }
+}
+} // namespace arm_compute
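Both shuffle routines above map an input channel c to group g = c / K and in-group index k = c - g * K, then write it to output channel k * num_groups + g, i.e. the channels are viewed as a (num_groups, K) matrix and transposed; the kernel performs the division through a precomputed float reciprocal (rK), while the sketch below uses plain integer division. A minimal standalone illustration with a worked example (6 channels, 2 groups); the names are illustrative only:

    // Channel-shuffle index mapping: view channels as (num_groups, K) and transpose to (K, num_groups).
    #include <cstdio>

    unsigned int shuffled_channel(unsigned int c, unsigned int num_groups, unsigned int K)
    {
        const unsigned int group_id   = c / K;            // which group the channel belongs to
        const unsigned int channel_id = c - group_id * K; // index inside that group
        return channel_id * num_groups + group_id;
    }

    int main()
    {
        // 6 channels, 2 groups => K = 3. Expected mapping: 0,1,2,3,4,5 -> 0,2,4,1,3,5
        const unsigned int num_groups = 2, K = 3;
        for(unsigned int c = 0; c < num_groups * K; ++c)
        {
            std::printf("%u -> %u\n", c, shuffled_channel(c, num_groups, K));
        }
        return 0;
    }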
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index bb8e758..d6517ac 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -29,26 +29,17 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
using namespace arm_compute;
+using namespace misc::shape_calculator;
namespace
{
-TensorShape get_output_shape(const ITensorInfo *input, const Size2D &convolved_dims)
-{
- TensorShape output_shape = input->tensor_shape();
- output_shape.set(0, convolved_dims.width);
- output_shape.set(1, convolved_dims.height);
- output_shape.set(2, input->tensor_shape()[0]);
- output_shape.set(3, input->tensor_shape()[3]); // For NEON the batch size is on the fourth dimension of the input tensor
-
- return output_shape;
-}
-
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
@@ -60,12 +51,28 @@
// Validate configured output
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, convolved_dims));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, false));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
return Status{};
}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims)
+{
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, false)));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps());
+
+ // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
} // namespace
template <typename T>
@@ -102,11 +109,6 @@
void NECol2ImKernel::configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(get_output_shape(input->info(), convolved_dims)));
-
- // Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims));
_input = input;
@@ -130,19 +132,15 @@
}
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- INEKernel::configure(win);
+ auto win_config = validate_and_configure_window(input->info(), output->info(), convolved_dims);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
}
Status NECol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims).first);
return Status{};
}
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
index 4582c88..7a66b6c 100644
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -112,6 +112,10 @@
_func = colorconvert_rgb_to_rgbx;
num_elems_processed_per_iteration = 16;
break;
+ case Format::U8:
+ _func = colorconvert_rgb_to_u8;
+ num_elems_processed_per_iteration = 16;
+ break;
default:
ARM_COMPUTE_ERROR("Not supported");
break;
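The new Format::U8 case routes RGB input to colorconvert_rgb_to_u8, i.e. an RGB-to-grayscale conversion processed 16 pixels per iteration. The exact weighting that routine applies is not visible in this hunk; purely as an illustration, and assuming a standard BT.709 luma weighting rather than the library's actual coefficients, a scalar version could look like:

    // Illustrative RGB -> U8 (grayscale) conversion using BT.709 luma weights.
    // The coefficients are an assumption; colorconvert_rgb_to_u8 may differ.
    #include <cstddef>
    #include <cstdint>

    void rgb_to_u8(const uint8_t *rgb, uint8_t *gray, size_t num_pixels)
    {
        for(size_t i = 0; i < num_pixels; ++i)
        {
            const float r = rgb[3 * i + 0];
            const float g = rgb[3 * i + 1];
            const float b = rgb[3 * i + 2];
            gray[i] = static_cast<uint8_t>(0.2126f * r + 0.7152f * g + 0.0722f * b + 0.5f);
        }
    }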
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index d2eac2c..31b688c 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -102,7 +102,7 @@
}
else
{
- const float diff = image_size - 1;
+ const float diff = image_size - cd_min;
for(unsigned int x = 0; x < _histogram_size; ++x)
{
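The one-line fix above changes the denominator of the equalization look-up table from (image_size - 1) to (image_size - cd_min), which matches the usual histogram-equalization formula lut(x) = round((cdf(x) - cdf_min) / (N - cdf_min) * 255), with N the pixel count and cdf_min the smallest non-zero CDF value. A generic sketch of that LUT construction (not the kernel's exact code):

    // Build a histogram-equalization LUT from a cumulative distribution.
    // cdf[256] is the running sum of the histogram, image_size the pixel count,
    // cd_min the smallest non-zero cdf value.
    #include <cmath>
    #include <cstdint>

    void build_equalization_lut(const uint32_t cdf[256], uint32_t image_size, uint32_t cd_min, uint8_t lut[256])
    {
        const float diff = static_cast<float>(image_size - cd_min);
        for(unsigned int x = 0; x < 256; ++x)
        {
            // Bins below cd_min map to 0; everything else is scaled into [0, 255]
            const float num = (cdf[x] >= cd_min) ? static_cast<float>(cdf[x] - cd_min) : 0.f;
            lut[x]          = static_cast<uint8_t>(std::lround(num / diff * 255.f));
        }
    }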
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 8280b52..158f401 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
@@ -34,68 +35,90 @@
using namespace arm_compute;
-namespace arm_compute
+namespace
{
-class Coordinates;
-} // namespace arm_compute
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_UNUSED(policy);
+ ARM_COMPUTE_RETURN_ERROR_ON(input == output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
+ && output->data_type() != DataType::S32),
+ "Only data_types supported [in] U8 -> [out] U16, S16, S32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
+ "Only data_types supported [in] U16 -> [out] U8, U32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
+ "Only data_types supported [in] S16 -> [out] U8, S32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && output->data_type() != DataType::F32,
+ "Only data_types supported [in] F16 -> [out] F32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && output->data_type() != DataType::F16,
+ "Only data_types supported [in] F32 -> [out] F16");
+
+ // Validate in case of configured output
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
: _input(nullptr), _output(nullptr), _policy(), _shift(0)
{
}
-void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertLayerKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Auto initialize output shape if not initialized (we can only auto-configure the shape; the data type must be given)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
_input = input;
- _output = input;
+ _output = output;
_policy = policy;
_shift = shift;
- if(output != nullptr)
- {
- // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
- set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
- // Set output
- _output = output;
- }
-
- ARM_COMPUTE_ERROR_ON(shift >= 8);
- ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type())));
-
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
- && output->info()->data_type() != DataType::S32),
- "Only data_types supported [in] U8 -> [out] U16, S16, S32");
-
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
- "Only data_types supported [in] U16 -> [out] U8, U32");
-
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
- "Only data_types supported [in] S16 -> [out] U8, S32");
-
- constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICPPKernel::configure(win_config.second);
+}
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- if(output != nullptr)
- {
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
- }
- else
- {
- // In-place computation
- update_window_and_padding(win, input_access);
- }
- ICPPKernel::configure(win);
+Status NEDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+ return Status{};
}
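The new static validate() follows the two-step pattern used across the library: validate_arguments() checks the tensor metadata, and validate_and_configure_window() is run on cloned TensorInfo objects so the window and padding requirements can be verified without touching real tensors. A hedged usage sketch, with shapes chosen arbitrarily for illustration, of pre-validating an F16 -> F32 conversion before configuring the kernel:

    // Pre-validate a depth conversion from metadata only (no allocations needed).
    #include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"

    using namespace arm_compute;

    bool can_run_depth_convert()
    {
        const TensorInfo src(TensorShape(16U, 16U), 1, DataType::F16);
        const TensorInfo dst(TensorShape(16U, 16U), 1, DataType::F32);

        const Status status = NEDepthConvertLayerKernel::validate(&src, &dst, ConvertPolicy::SATURATE, 0);
        return bool(status); // true when no error is reported
    }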
void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
@@ -103,8 +126,7 @@
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(nullptr == _input);
- ARM_COMPUTE_ERROR_ON(nullptr == _output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
ARM_COMPUTE_ERROR_ON(_input == _output);
Iterator input(_input, window);
@@ -341,6 +363,68 @@
}
break;
}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ switch(_output->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ const float32x4_t scale = vdupq_n_f32(1 << _shift);
+
+ /* Up-conversion F16 -> F32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x8x2_t texels =
+ {
+ {
+ vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())),
+ vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8)
+ }
+ };
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ case DataType::F32:
+ switch(_output->info()->data_type())
+ {
+ case DataType::F16:
+ {
+ const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
+
+ /* Down-conversion F32 -> F16 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x4_t texels =
+ {
+ {
+ vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
+ vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
+ vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
+ vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale)
+ }
+ };
+
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
ARM_COMPUTE_ERROR("Not supported");
}
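The added F16/F32 cases convert 16 elements per iteration and fold the kernel's shift parameter into a multiplicative scale: 2^shift on the way up (F16 -> F32) and 2^-shift on the way down (F32 -> F16). A scalar restatement of that behaviour, illustrative only and, like the NEON path above, requiring a toolchain with FP16 support for the float16_t type:

    // Scalar view of the F16 <-> F32 conversions with shift scaling.
    #include <arm_neon.h> // for float16_t (needs FP16 support, as the #ifdef guard above indicates)
    #include <cstddef>
    #include <cstdint>

    void convert_f16_to_f32(const float16_t *src, float *dst, size_t n, uint32_t shift)
    {
        const float scale = static_cast<float>(1u << shift); // up-conversion multiplies by 2^shift
        for(size_t i = 0; i < n; ++i)
        {
            dst[i] = static_cast<float>(src[i]) * scale;
        }
    }

    void convert_f32_to_f16(const float *src, float16_t *dst, size_t n, uint32_t shift)
    {
        const float scale = 1.f / static_cast<float>(1u << shift); // down-conversion divides by 2^shift
        for(size_t i = 0; i < n; ++i)
        {
            dst[i] = static_cast<float16_t>(src[i] * scale);
        }
    }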
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 09e4acd..99bdb7a 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -25,7 +25,6 @@
#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -147,7 +146,7 @@
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
const DataLayout data_layout = input->data_layout();
@@ -166,8 +165,14 @@
const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (output->data_type() != DataType::S32));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input->data_type()) && (output->data_type() != DataType::F32));
+ if(is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
}
return Status{};
@@ -193,8 +198,10 @@
output_shape.set(1, convolver->output_size(output_shape.y(), same_padding)); // Set width
output_shape.set(2, convolver->output_size(output_shape.z(), same_padding)); // Set height
+ const DataType output_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
// Configure window (optimised)
// Set padding in channels
@@ -230,6 +237,11 @@
case DataType::QASYMM8:
num_elems_read_per_iteration = 16;
break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ num_elems_read_per_iteration = 24;
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
num_elems_read_per_iteration = 12;
break;
@@ -314,7 +326,7 @@
}
// Check supported data type
- bool supported_datatype = (dt == DataType::F32);
+ bool supported_datatype = is_data_type_float(dt) || is_data_type_quantized(dt);
// Check for supported strides
const auto &strides = conv_info.stride();
@@ -335,11 +347,15 @@
void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
_convolver = create_convolver_object(_conv_info, _weights, _input, _output, true);
+ if(_convolver)
+ {
+ _convolver->set_offsets(-_input->info()->quantization_info().offset, -_weights->info()->quantization_info().offset);
+ }
}
void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
@@ -372,6 +388,11 @@
switch(_input->info()->data_type())
{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ convolve_3x3<float16_t, float16_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
break;
@@ -399,6 +420,7 @@
ITensor *out,
bool setup_strides)
{
+ const DataType dt = in->info()->data_type();
const TensorShape shape = in->info()->tensor_shape();
const int in_rows = shape.z();
const int in_cols = shape.y();
@@ -415,34 +437,85 @@
const int output_batch_stride = (setup_strides) ? out->info()->strides_in_bytes()[3] / out->info()->element_size() : 0;
const auto stride_x = conv_info.stride().first;
- switch(stride_x)
+ switch(dt)
{
- case 1:
- return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>>(
- n_batches,
- in_rows,
- in_cols,
- n_channels,
- padding_same,
- reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
- reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
- reinterpret_cast<float *>(out->ptr_to_element(Coordinates())),
- weight_col_stride, weight_row_stride,
- input_col_stride, input_row_stride, input_batch_stride,
- output_col_stride, output_row_stride, output_batch_stride);
- case 2:
- return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>>(
- n_batches,
- in_rows,
- in_cols,
- n_channels,
- padding_same,
- reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
- reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
- reinterpret_cast<float *>(out->ptr_to_element(Coordinates())),
- weight_col_stride, weight_row_stride,
- input_col_stride, input_row_stride, input_batch_stride,
- output_col_stride, output_row_stride, output_batch_stride);
+ case DataType::QASYMM8:
+ {
+ switch(stride_x)
+ {
+ case 1:
+ return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>>(
+ n_batches, in_rows, in_cols, n_channels, padding_same,
+ reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
+ in->ptr_to_element(Coordinates()),
+ reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+ weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+ output_col_stride, output_row_stride, output_batch_stride);
+ case 2:
+ return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>>(
+ n_batches, in_rows, in_cols, n_channels, padding_same,
+ reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
+ in->ptr_to_element(Coordinates()),
+ reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+ weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+ output_col_stride, output_row_stride, output_batch_stride);
+ default:
+ return nullptr;
+ }
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ switch(stride_x)
+ {
+ case 1:
+ return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>>(
+ n_batches, in_rows, in_cols, n_channels, padding_same,
+ reinterpret_cast<const float16_t *>(w->ptr_to_element(Coordinates())),
+ reinterpret_cast<float16_t *>(in->ptr_to_element(Coordinates())),
+ reinterpret_cast<float16_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+ weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+ output_col_stride, output_row_stride, output_batch_stride);
+ case 2:
+ return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>>(
+ n_batches, in_rows, in_cols, n_channels, padding_same,
+ reinterpret_cast<const float16_t *>(w->ptr_to_element(Coordinates())),
+ reinterpret_cast<float16_t *>(in->ptr_to_element(Coordinates())),
+ reinterpret_cast<float16_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+ weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+ output_col_stride, output_row_stride, output_batch_stride);
+ default:
+ return nullptr;
+ }
+ break;
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ {
+ switch(stride_x)
+ {
+ case 1:
+ return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>>(
+ n_batches, in_rows, in_cols, n_channels, padding_same,
+ reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
+ reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
+ reinterpret_cast<float *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+ weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+ output_col_stride, output_row_stride, output_batch_stride);
+ case 2:
+ return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>>(
+ n_batches, in_rows, in_cols, n_channels, padding_same,
+ reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
+ reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
+ reinterpret_cast<float *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+ weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+ output_col_stride, output_row_stride, output_batch_stride);
+ default:
+ return nullptr;
+ }
+ break;
+ }
default:
return nullptr;
}
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index 92ee8d5..e8fb8cd 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 2d17c23..921582a 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 22a2cf8..77ab5ad 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
index 06e6b03..cfed324 100644
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -81,9 +81,11 @@
AccessWindowHorizontal out_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal out_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
+ // TODO(COMPMID-1503) Fix x-access input bug in NEON kernel instead of '+2'
AccessWindowHorizontal in_x_access(input->info(), -border_size().left, num_elems_processed_per_iteration + 2);
AccessWindowRectangle in_y_access(input->info(), 0, -border_size().left, num_elems_processed_per_iteration, num_rows_read_per_iteration);
+ // TODO(COMPMID-1503) Fix x-access input bug in NEON kernel instead of '+2'
AccessWindowRectangle in_xy_access(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_rows_read_per_iteration);
if(run_der_x && run_der_y)
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index f525d93..162c4b1 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -36,6 +36,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include <algorithm>
#include <arm_neon.h>
@@ -603,10 +604,9 @@
out_values = internal_vmlal(out_values, in_values, we_values);
}
- out_val += out_values[0];
- out_val += out_values[1];
- out_val += out_values[2];
- out_val += out_values[3];
+ auto carry_addition = wrapper::vpadd(wrapper::vgethigh(out_values), wrapper::vgetlow(out_values));
+ carry_addition = wrapper::vpadd(carry_addition, carry_addition);
+ out_val += wrapper::vgetlane(carry_addition, 0);
// Leftover
for(; x < input_width; ++x)
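The replacement above swaps four scalar lane reads for a pairwise-add reduction: one vpadd of the high and low halves yields two partial sums, a second vpadd adds those together, and a single lane extraction gives the horizontal total of the 4-lane accumulator. The same reduction written directly with NEON intrinsics (a sketch of the idea; the kernel goes through the wrapper:: helpers so the code also works for other element types):

    // Horizontal sum of a float32x4_t using two pairwise additions.
    #include <arm_neon.h>

    inline float hsum_f32x4(float32x4_t v)
    {
        // [a b c d] -> vpadd([c d], [a b]) = [c+d, a+b]
        float32x2_t sum2 = vpadd_f32(vget_high_f32(v), vget_low_f32(v));
        // [c+d, a+b] -> [a+b+c+d, a+b+c+d]
        sum2 = vpadd_f32(sum2, sum2);
        return vget_lane_f32(sum2, 0);
    }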
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index eefbd98..a571d54 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -194,8 +194,8 @@
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <typename T1, typename T2, bool in_place, bool has_bias>
-void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
@@ -304,14 +304,14 @@
internal_vst1q(out_ptr, internal_vld1q(in_ptr));
}
},
- in, bi);
+ in, bi, out);
}
}
// QASYMM8 specializations
template <>
-void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
uint8x16_t min = vdupq_n_u8(0);
@@ -352,8 +352,8 @@
in, out);
}
template <>
-void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
ARM_COMPUTE_UNUSED(bias);
@@ -382,6 +382,85 @@
},
in, out);
}
+template <>
+void output_stage_nhwc<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+ uint8x16_t min = vdupq_n_u8(0);
+ uint8x16_t max = vdupq_n_u8(255);
+
+ Window window_bias = window;
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ Iterator in(input, window);
+ Iterator bi(bias, window_bias);
+
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr());
+
+ // Accumulate bias
+ int32x4x4_t v_in =
+ {
+ {
+ vaddq_s32(vld1q_s32(in_ptr), vld1q_s32(bias_ptr)),
+ vaddq_s32(vld1q_s32(in_ptr + 4), vld1q_s32(bias_ptr + 4)),
+ vaddq_s32(vld1q_s32(in_ptr + 8), vld1q_s32(bias_ptr + 8)),
+ vaddq_s32(vld1q_s32(in_ptr + 12), vld1q_s32(bias_ptr + 12))
+ }
+ };
+
+ const auto out_ptr = out.ptr();
+ vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ },
+ in, bi, out);
+}
+template <>
+void output_stage_nhwc<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ ARM_COMPUTE_UNUSED(bias);
+
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+ uint8x16_t min = vdupq_n_u8(0);
+ uint8x16_t max = vdupq_n_u8(255);
+
+ Window window_bias = window;
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ Iterator in(input, window);
+ Iterator bi(bias, window_bias);
+
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+
+ // Accumulate bias
+ int32x4x4_t v_in =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4),
+ vld1q_s32(in_ptr + 8),
+ vld1q_s32(in_ptr + 12)
+ }
+ };
+
+ const auto out_ptr = out.ptr();
+ vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ },
+ in, bi, out);
+}
} // namespace
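The two new NHWC specializations accumulate the optional bias in S32 and then requantize to QASYMM8 through finalize_quantization: a fixed-point multiply by result_fixedpoint_multiplier, a rounding arithmetic shift right by result_shift, addition of result_offset_after_shift, and a clamp to [0, 255]. A scalar sketch of that requantization step, based on the gemmlowp-style scheme and slightly simplified in its rounding of negative values (finalize_quantization's exact code is not shown in this hunk):

    // Scalar sketch of gemmlowp-style requantization from an int32 accumulator to uint8.
    #include <algorithm>
    #include <cstdint>

    uint8_t requantize(int32_t acc, int32_t multiplier, int32_t shift, int32_t offset_after_shift)
    {
        // Rounding-doubling high multiply: high 32 bits of 2*acc*multiplier, rounded to nearest
        const int64_t ab    = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
        const int32_t nudge = (ab >= 0) ? (1 << 30) : (1 - (1 << 30));
        const int32_t high  = static_cast<int32_t>((ab + nudge) / (int64_t(1) << 31));

        // Rounding arithmetic shift right by 'shift'
        const int32_t rounding = (shift > 0) ? (1 << (shift - 1)) : 0;
        const int32_t shifted  = (high + rounding) >> shift;

        // Add the output offset and clamp to the uint8 range
        const int32_t result = shifted + offset_after_shift;
        return static_cast<uint8_t>(std::min(255, std::max(0, result)));
    }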
NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
@@ -426,19 +505,19 @@
{
case DataType::S32:
{
- _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+ _func = (bias == nullptr) ? &output_stage_nchw<int32_t, uint8_t, false, false> : &output_stage_nchw<int32_t, uint8_t, false, true>;
break;
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
- _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
+ _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, true> : &output_stage_nchw<float16_t, float16_t, false, true>;
break;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
- _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
+ _func = (output == nullptr) ? &output_stage_nchw<float, float, true, true> : &output_stage_nchw<float, float, false, true>;
break;
}
default:
@@ -451,6 +530,18 @@
{
switch(input->info()->data_type())
{
+ case DataType::S32:
+ {
+ _func = (output == nullptr) ? &output_stage_nhwc<int32_t, uint8_t, false, false> : &output_stage_nhwc<int32_t, uint8_t, false, true>;
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ _func = (output == nullptr) ? &output_stage_nhwc<float16_t, float16_t, true, true> : &output_stage_nhwc<float16_t, float16_t, false, true>;
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
_func = (output == nullptr) ? &output_stage_nhwc<float, float, true, true> : &output_stage_nhwc<float, float, false, true>;
diff --git a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
new file mode 100644
index 0000000..b8452fb
--- /dev/null
+++ b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+using namespace misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_flatten_shape(input));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_flatten_shape(input)));
+
+ Window win = calculate_max_window(*input, Steps()); // Flatten does not need padding
+
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFlattenLayerKernel::NEFlattenLayerKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void NEFlattenLayerKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEFlattenLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void NEFlattenLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const size_t in_width = _input->info()->dimension(0);
+ const size_t in_height = _input->info()->dimension(1);
+ const size_t out_step_x = in_width * _input->info()->element_size();
+ const size_t out_step_y = out_step_x * in_height;
+
+ Window in_window(window);
+ in_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info()->tensor_shape());
+ out_window.set(Window::DimX, Window::Dimension(out_window.x().start(), out_window.x().end(), in_width));
+
+ Window in_slice = in_window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_1D();
+
+ do
+ {
+ Iterator in(_input, in_slice);
+ Iterator out(_output, out_slice);
+
+ uint8_t *out_ptr = out.ptr();
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ memcpy(out_ptr + id.y() * out_step_x + id.z() * out_step_y, in.ptr(), out_step_x);
+ },
+ in);
+ }
+ while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
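The run() loop above copies one input row per iteration: out_step_x is the byte size of a full row and each (y, z) coordinate of the 3D input slice lands at offset y * out_step_x + z * out_step_y in the flattened output, so a W x H x C input collapses into a vector of W*H*C elements per batch. A minimal sketch of the same row-by-row copy for a single batch, with strides expressed in elements and names that are illustrative only:

    // Row-by-row flatten mirroring NEFlattenLayerKernel::run for one batch:
    // row (y, z) of the input is copied to offset (z * height + y) * width in the output.
    #include <cstddef>
    #include <cstring>

    void flatten_rows(const float *src, float *dst,
                      size_t width, size_t height, size_t channels,
                      size_t src_stride_y_elems, size_t src_stride_z_elems)
    {
        for(size_t z = 0; z < channels; ++z)
        {
            for(size_t y = 0; y < height; ++y)
            {
                const float *row = src + z * src_stride_z_elems + y * src_stride_y_elems;
                std::memcpy(dst + (z * height + y) * width, row, width * sizeof(float));
            }
        }
    }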
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index 872ac26..6551d9e 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
@@ -33,7 +34,42 @@
#include <arm_neon.h>
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+ // Validate in case of configured output
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ auto_init_if_empty(*output, *input);
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
void NEFloorKernel::configure(const ITensor *input, ITensor *output)
{
@@ -42,24 +78,24 @@
// Auto initialize output
auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ // Validate
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
+Status NEFloorKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
- INEKernel::configure(win);
+ return Status{};
}
void NEFloorKernel::run(const Window &window, const ThreadInfo &info)
@@ -68,13 +104,34 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ const DataType data_type = _input->info()->data_type();
+
Iterator input(_input, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ if(data_type == DataType::F32)
{
- const float32x4_t res = vfloorq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())));
- vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
- },
- input, output);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4_t res = vfloorq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
+ },
+ input, output);
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else if(data_type == DataType::F16)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x8_t res = vfloorq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())));
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
+ },
+ input, output);
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else
+ {
+ ARM_COMPUTE_ERROR("Invalid data type!");
+ }
}
+} // namespace arm_compute
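// NEFloorKernel::run() now dispatches on the data type: the F32 path uses
// vfloorq_f32 and the F16 path vfloorq_f16, the latter compiled only when
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is available. Scalar reference for the
// F32 loop (illustrative only):
#include <cmath>
#include <cstddef>

void floor_ref_f32(const float *in, float *out, std::size_t n)
{
    for(std::size_t i = 0; i < n; ++i)
    {
        out[i] = std::floor(in[i]);
    }
}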
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index af84d02..33a5b4a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -62,16 +62,24 @@
if(b_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+ // Validate input
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
TensorShape output_shape = mm_result->tensor_shape();
if(output_shape.num_dimensions() > 1)
{
+ const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(2);
+ output_shape.collapse_from(output_batch_idx);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
"mm_result tensor must have the same number of batches of output tensor");
if(a_offset != 0)
@@ -117,6 +125,217 @@
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
+
+template <bool is_gemm3d>
+void run_offset_contribution(const Window &window,
+ ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row,
+ int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col)
+{
+ Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
+
+ const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
+ const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+ if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
+ {
+ // Set window for vector_sum_col
+ Window win_vector_sum_col(collapsed_window);
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row(collapsed_window);
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
+ Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+ Iterator mm_result_it(mm_result, window);
+
+ const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 =
+ {
+ {
+ vld1q_s32(vector_sum_col_ptr + 0),
+ vld1q_s32(vector_sum_col_ptr + 4),
+ vld1q_s32(vector_sum_col_ptr + 8),
+ vld1q_s32(vector_sum_col_ptr + 12)
+ }
+ };
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ // Compute the leftover term due to b_offset.
+ int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y()
+ + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
+
+ // Add a_offset_term_s32 and b_offset_term_s32
+ int32x4x4_t offset_term_s32 =
+ {
+ {
+ vdupq_n_s32(k_offset),
+ vdupq_n_s32(k_offset),
+ vdupq_n_s32(k_offset),
+ vdupq_n_s32(k_offset)
+ }
+ };
+
+ offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32));
+ offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32));
+ offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32));
+ offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32));
+
+ int32x4x4_t in_s32 =
+ {
+ {
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
+ }
+ };
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+ },
+ vector_sum_col_it, vector_sum_row_it, mm_result_it);
+ }
+ else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row(collapsed_window);
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+ Iterator mm_result_it(mm_result, window);
+
+ const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+
+ // Compute the leftover term due to b_offset.
+ int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y()
+ + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
+
+ int32x4x4_t in_s32 =
+ {
+ {
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
+ }
+ };
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32);
+
+ // Store the result with the offset contribution
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+ },
+ vector_sum_row_it, mm_result_it);
+ }
+ else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
+ {
+ // Set window for vector_sum_col
+ Window win_vector_sum_col(collapsed_window);
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+ win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
+ Iterator mm_result_it(mm_result, window);
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 =
+ {
+ {
+ vld1q_s32(vector_sum_col_ptr + 0),
+ vld1q_s32(vector_sum_col_ptr + 4),
+ vld1q_s32(vector_sum_col_ptr + 8),
+ vld1q_s32(vector_sum_col_ptr + 12)
+ }
+ };
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ int32x4x4_t in_s32 =
+ {
+ {
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
+ }
+ };
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
+ vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+ },
+ vector_sum_col_it, mm_result_it);
+ }
+ else // false, false
+ {
+ // No offset contribution from matrix A and matrix B
+ return;
+ }
+}
} // namespace
NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
@@ -177,193 +396,17 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimZ);
+ // Check if input is a 3D reinterpretation
+ const bool reinterpret_as_3d = _vector_sum_row != nullptr
+ && _mm_result->info()->num_dimensions() > 1
+ && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x();
- if(_a_offset != 0 && _b_offset != 0) // true, true
+ if(reinterpret_as_3d)
{
- // Set window for vector_sum_col
- Window win_vector_sum_col(collapsed_window);
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- if(!_slide_vector_sum_col)
- {
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
- }
-
- // Set window for vector_sum_row
- Window win_vector_sum_row(collapsed_window);
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col);
- Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row);
- Iterator mm_result(_mm_result, window);
-
- const size_t sum_row_stride_y = _vector_sum_row->info()->strides_in_bytes().y();
-
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- // Compute the leftover term due to a_offset.
- int32x4x4_t a_offset_term_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 12)
- }
- };
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset);
-
- // Compute the leftover term due to b_offset.
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row.ptr() + id.z() * sum_row_stride_y) + id.y());
- b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset);
-
- // Add a_offset_term_s32 and b_offset_term_s32
- int32x4x4_t offset_term_s32 =
- {
- {
- vdupq_n_s32(_k_offset),
- vdupq_n_s32(_k_offset),
- vdupq_n_s32(_k_offset),
- vdupq_n_s32(_k_offset)
- }
- };
-
- offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32));
- offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32));
- offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32));
- offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32));
-
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
-
- // Store the result with the offset contribution
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 0, in_s32.val[0]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 4, in_s32.val[1]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 8, in_s32.val[2]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 12, in_s32.val[3]);
- },
- vector_sum_col, vector_sum_row, mm_result);
+ run_offset_contribution<true>(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col);
}
- else if((_a_offset == 0) && (_b_offset != 0)) // false, true
+ else
{
- // Set window for vector_sum_row
- Window win_vector_sum_row(collapsed_window);
- win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row);
- Iterator mm_result(_mm_result, window);
-
- const size_t sum_row_stride_y = _vector_sum_row->info()->strides_in_bytes().y();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Compute the leftover term due to b_offset.
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row.ptr() + id.z() * sum_row_stride_y) + id.y());
- b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, _b_offset);
-
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32);
-
- // Store the result with the offset contribution
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 0, in_s32.val[0]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 4, in_s32.val[1]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 8, in_s32.val[2]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 12, in_s32.val[3]);
- },
- vector_sum_row, mm_result);
+ run_offset_contribution<false>(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col);
}
- else if((_a_offset != 0) && (_b_offset == 0)) // true, false
- {
- // Set window for vector_sum_col
- Window win_vector_sum_col(collapsed_window);
- win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
- if(!_slide_vector_sum_col)
- {
- win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
- }
-
- Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col);
- Iterator mm_result(_mm_result, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Compute the leftover term due to a_offset.
- int32x4x4_t a_offset_term_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 12)
- }
- };
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset);
-
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
-
- // Store the result with the offset contribution
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 0, in_s32.val[0]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 4, in_s32.val[1]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 8, in_s32.val[2]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 12, in_s32.val[3]);
- },
- vector_sum_col, mm_result);
- }
- else // false, false
- {
- // No offset contribution from matrix A and matrix B
- return;
- }
-}
+}
\ No newline at end of file
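// Scalar reference for the NEON loops in run_offset_contribution() above,
// 2-D case only (illustrative sketch; for the is_gemm3d case the row index
// becomes id.y() + (id.z() % depth) * height and the batch index id.z() / depth,
// exactly as in the vectorised code). vector_sum_col / vector_sum_row are the
// per-column and per-row reductions and k_offset = a_offset * b_offset * k.
#include <cstdint>

void offset_contribution_ref(int32_t *mm_result, const int32_t *vector_sum_col,
                             const int32_t *vector_sum_row, int M, int N,
                             int32_t a_offset, int32_t b_offset, int32_t k_offset)
{
    for(int y = 0; y < M; ++y)
    {
        for(int x = 0; x < N; ++x)
        {
            mm_result[y * N + x] += a_offset * vector_sum_col[x]
                                    + b_offset * vector_sum_row[y]
                                    + k_offset;
        }
    }
}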
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 5e14e1a..024c4f8 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -28,10 +28,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <arm_neon.h>
#include <cstddef>
@@ -58,7 +60,7 @@
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, input);
}
return Status{};
@@ -71,8 +73,11 @@
// For this reason num_elems_processed_per_iteration is set to 1
constexpr unsigned int num_elems_processed_per_iteration = 1;
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
+
// Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
@@ -81,10 +86,7 @@
if(output->total_size() != 0)
{
- AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, output_result_access);
-
- output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
}
if(bias != nullptr)
@@ -148,12 +150,11 @@
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator in(_input, win);
- Iterator out(_output, win);
-
+ Iterator in(_input, win_collapsed);
+ Iterator out(_output, win_collapsed);
if(_bias != nullptr)
{
Window win_biases;
@@ -161,7 +162,7 @@
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias(_bias, win_biases);
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win_collapsed, [&](const Coordinates & id)
{
// Compute 16 elements per iteration
int x = window_start_x;
@@ -210,11 +211,11 @@
static_cast<uint8_t>(_max));
}
},
- in, bias, out);
+ in, out, bias);
}
else
{
- execute_window_loop(win, [&](const Coordinates & id)
+ execute_window_loop(win_collapsed, [&](const Coordinates & id)
{
// Compute 16 elements per iteration
int x = window_start_x;
@@ -256,15 +257,7 @@
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (bias != nullptr) ? bias->info() : nullptr,
- output->info(),
- min,
- max));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
_input = input;
_bias = bias;
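// Rough scalar model of the per-element requantization this kernel performs
// (illustrative only: the real path uses the saturating rounding doubling high
// multiply and rounding right shift from NEAsymm.h; here a plain float `scale`
// stands in for result_fixedpoint_multiplier * 2^-(31 + result_shift)).
#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t quantize_down_ref(int32_t acc, int32_t bias, float scale,
                          int32_t offset_after_shift, int32_t min_u8, int32_t max_u8)
{
    int32_t v = static_cast<int32_t>(std::lround((acc + bias) * scale));
    v += offset_after_shift;
    v = std::max(min_u8, std::min(max_u8, v));
    return static_cast<uint8_t>(v);
}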
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index cd6aa55..757dbbc 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -32,15 +32,27 @@
#include <arm_neon.h>
-using namespace arm_compute;
-
namespace arm_compute
{
-class Coordinates;
-} // namespace arm_compute
-
namespace
{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float beta)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(beta);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+
void matrix_addition_f32(const ITensor *input, ITensor *output, const Window &window, float beta)
{
const float32x4_t beta_f32 = vdupq_n_f32(beta);
@@ -101,12 +113,10 @@
void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
{
- ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), beta));
switch(input->info()->data_type())
{
@@ -123,13 +133,21 @@
break;
}
+ // Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 16;
-
INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
_beta = beta;
}
+Status NEGEMMMatrixAdditionKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, beta));
+ ARM_COMPUTE_RETURN_ON_ERROR(INESimpleKernel::validate(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration));
+ return Status{};
+}
+
void NEGEMMMatrixAdditionKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -141,3 +159,4 @@
(*_func)(_input, _output, window, _beta);
}
}
+} // namespace arm_compute
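// Scalar model of the F32 path of NEGEMMMatrixAdditionKernel (illustrative):
// the kernel accumulates beta * input into the already-computed GEMM output.
#include <cstddef>

void matrix_addition_ref(const float *in, float *out, std::size_t n, float beta)
{
    for(std::size_t i = 0; i < n; ++i)
    {
        out[i] += beta * in[i];
    }
}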
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index 0ca2474..f182fb2 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 5e1c216..61221c1 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -39,330 +39,6 @@
using namespace arm_compute;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-namespace fp16
-{
-inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
-{
- static const float16x8_t zero = vdupq_n_f16(0.f);
-
- // Trace^2
- float16x8_t trace2 = vaddq_f16(gx2, gy2);
- trace2 = vmulq_f16(trace2, trace2);
-
- // Det(A)
- float16x8_t det = vmulq_f16(gx2, gy2);
- det = vfmsq_f16(det, gxgy, gxgy);
-
- // Det(A) - sensitivity * trace^2
- const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
-
- // mc > strength_thresh
- const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
-
- return vbslq_f16(mask, mc, zero);
-}
-
-template <size_t block_size>
-inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
- float norm_factor)
-{
- const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
-
- // Normalize
- low_gx = vmulq_f16(low_gx, norm_factor_fp16);
- low_gy = vmulq_f16(low_gy, norm_factor_fp16);
- high_gx = vmulq_f16(high_gx, norm_factor_fp16);
- high_gy = vmulq_f16(high_gy, norm_factor_fp16);
-
- float16x8_t gx = vextq_f16(low_gx, high_gx, 0);
- float16x8_t gy = vextq_f16(low_gy, high_gy, 0);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 1);
- gy = vextq_f16(low_gy, high_gy, 1);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 2);
- gy = vextq_f16(low_gy, high_gy, 2);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- if(block_size > 3)
- {
- gx = vextq_f16(low_gx, high_gx, 3);
- gy = vextq_f16(low_gy, high_gy, 3);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 4);
- gy = vextq_f16(low_gy, high_gy, 4);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
- }
-
- if(block_size == 7)
- {
- gx = vextq_f16(low_gx, high_gx, 5);
- gy = vextq_f16(low_gy, high_gy, 5);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
-
- gx = vextq_f16(low_gx, high_gx, 6);
- gy = vextq_f16(low_gy, high_gy, 6);
-
- gx2 = vfmaq_f16(gx2, gx, gx);
- gy2 = vfmaq_f16(gy2, gy, gy);
- gxgy = vfmaq_f16(gxgy, gx, gy);
- }
-}
-
-template <size_t block_size>
-inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
- float strength_thresh)
-{
- auto gx_ptr_0 = static_cast<const int16_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
- auto gy_ptr_0 = static_cast<const int16_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
- const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
- const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
- const auto output = static_cast<float *__restrict>(out_ptr);
-
- // Gx^2, Gy^2 and Gx*Gy
- float16x8_t gx2 = vdupq_n_f16(0.0f);
- float16x8_t gy2 = vdupq_n_f16(0.0f);
- float16x8_t gxgy = vdupq_n_f16(0.0f);
-
- for(size_t i = 0; i < block_size; ++i)
- {
- const float16x8_t low_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_0));
- const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
- const float16x8_t low_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
- const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
- harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
- // Update gx and gy pointer
- gx_ptr_0 += in_stride;
- gy_ptr_0 += in_stride;
- gx_ptr_1 += in_stride;
- gy_ptr_1 += in_stride;
- }
-
- // Calculate harris score
- const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
- // Store score
- vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
- vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-template <size_t block_size>
-inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
- float strength_thresh)
-{
- static const float16x8_t zero = vdupq_n_f16(0.0f);
-
- auto gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
- auto gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
- const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
- const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
- const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
- const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
- const auto output = static_cast<float *__restrict>(out_ptr);
-
- // Gx^2, Gy^2 and Gx*Gy
- float16x8_t gx2 = zero;
- float16x8_t gy2 = zero;
- float16x8_t gxgy = zero;
-
- for(size_t i = 0; i < block_size; ++i)
- {
- const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
- const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
- vget_low_f16(zero));
- const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
- const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
- vget_low_f16(zero));
- harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
- // Update gx and gy pointer
- gx_ptr_0 += in_stride;
- gy_ptr_0 += in_stride;
- gx_ptr_1 += in_stride;
- gy_ptr_1 += in_stride;
- gx_ptr_2 += in_stride;
- gy_ptr_2 += in_stride;
- }
-
- // Calculate harris score
- const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
- // Store score
- vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
- vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-template <>
-inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
- float strength_thresh)
-{
- static const float16x8_t zero = vdupq_n_f16(0.0f);
-
- auto gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - 3 * (in_stride + 1);
- auto gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - 3 * (in_stride + 1);
- const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
- const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
- const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
- const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
- const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
- const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
- const auto output = static_cast<float *__restrict>(out_ptr);
-
- // Gx^2, Gy^2 and Gx*Gy
- float16x8_t gx2 = zero;
- float16x8_t gy2 = zero;
- float16x8_t gxgy = zero;
-
- for(size_t i = 0; i < 7; ++i)
- {
- const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
- const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
- const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
- const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
- vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
- harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
- // Update gx and gy pointer
- gx_ptr_0 += in_stride;
- gy_ptr_0 += in_stride;
- gx_ptr_1 += in_stride;
- gy_ptr_1 += in_stride;
- gx_ptr_2 += in_stride;
- gy_ptr_2 += in_stride;
- }
-
- // Calculate harris score
- const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
- // Store score
- vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
- vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-} // namespace fp16
-
-template <int32_t block_size>
-BorderSize NEHarrisScoreFP16Kernel<block_size>::border_size() const
-{
- return _border_size;
-}
-
-template <int32_t block_size>
-NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
- : INEHarrisScoreKernel(), _func(nullptr)
-{
-}
-
-template <int32_t block_size>
-void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- Iterator input1(_input1, window);
- Iterator input2(_input2, window);
- Iterator output(_output, window);
-
- const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
- },
- input1, input2, output);
-}
-
-template <int32_t block_size>
-void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
- bool border_undefined)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
- _sensitivity = sensitivity;
- _strength_thresh = strength_thresh;
- _norm_factor = norm_factor;
- _border_size = BorderSize(block_size / 2);
-
- if(input1->info()->data_type() == DataType::S16)
- {
- _func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
- }
- else
- {
- _func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
- }
-
- ARM_COMPUTE_ERROR_ON(nullptr == _func);
-
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- constexpr unsigned int num_elems_read_per_iteration = 16;
- constexpr unsigned int num_elems_written_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = block_size;
-
- // Configure kernel window
- Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
- AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
- output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
- input2->info()->valid_region());
-
- output_access.set_valid_region(win, valid_region, border_undefined, border_size());
-
- INEKernel::configure(win);
-}
-
-template class arm_compute::NEHarrisScoreFP16Kernel<3>;
-template class arm_compute::NEHarrisScoreFP16Kernel<5>;
-template class arm_compute::NEHarrisScoreFP16Kernel<7>;
-
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
template class arm_compute::NEHarrisScoreKernel<3>;
template class arm_compute::NEHarrisScoreKernel<5>;
template class arm_compute::NEHarrisScoreKernel<7>;
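// The removed FP16 specialisations computed the same Harris response that the
// generic NEHarrisScoreKernel instantiations above still provide; in scalar
// form (illustrative):
float harris_response_ref(float gx2, float gy2, float gxgy,
                          float sensitivity, float strength_thresh)
{
    const float trace2 = (gx2 + gy2) * (gx2 + gy2);
    const float det    = gx2 * gy2 - gxgy * gxgy;
    const float mc     = det - sensitivity * trace2;
    return (mc > strength_thresh) ? mc : 0.f;
}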
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 98b1488..2c51eae 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -41,11 +41,12 @@
#include <tuple>
using namespace arm_compute;
+using namespace misc::shape_calculator;
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
+ bool has_bias, const Size2D &dilation, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -55,18 +56,7 @@
if(output->total_size() > 0)
{
- TensorShape expected_output_shape;
-
- if(is_flatten || is_fully_connected)
- {
- expected_output_shape = misc::shape_calculator::compute_flatten_shape(input);
- }
- else
- {
- expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false);
- }
-
- TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -74,23 +64,48 @@
return Status{};
}
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+ bool has_bias, const Size2D &dilation)
+{
+ const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+ std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input->dimension(width_idx), input->dimension(height_idx),
+ kernel_dims.width, kernel_dims.height,
+ conv_info, dilation);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false)));
+
+ Window win = calculate_max_window(*input, Steps());
+ win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
+ win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1));
+ win.set(channel_idx, Window::Dimension(0, 1, 1));
+
+ // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+
template <typename T, bool has_pads>
-inline void linearize_volume(const uint8_t *const in_ptr,
- T *out_ptr,
- bool has_bias,
- int top_left_x,
- int top_left_y,
- int kernel_width,
- int kernel_height,
- int kernel_depth,
- int input_w,
- int input_h,
- int input_stride_x,
- int input_stride_y,
- int input_stride_z,
- int pad_value,
- int dilation_x,
- int dilation_y)
+inline void linearize_volume_nchw(const uint8_t *const in_ptr,
+ T *out_ptr,
+ bool has_bias,
+ int top_left_x,
+ int top_left_y,
+ int kernel_width,
+ int kernel_height,
+ int kernel_depth,
+ int input_w,
+ int input_h,
+ int input_stride_x,
+ int input_stride_y,
+ int input_stride_z,
+ int pad_value,
+ int dilation_x,
+ int dilation_y)
{
const int kernel_size2 = kernel_width * kernel_height;
const int x_e = top_left_x + kernel_width * dilation_x;
@@ -171,10 +186,63 @@
*out_ptr = static_cast<T>(1);
}
}
-} // namespace
template <typename T, bool has_pads>
-void NEIm2ColKernel::run_generic(const Window &window)
+inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
+ T *out_ptr,
+ bool has_bias,
+ int start_x,
+ int start_y,
+ int kernel_width,
+ int kernel_height,
+ int input_w,
+ int input_h,
+ int input_c,
+ int input_stride_y,
+ int input_stride_z,
+ int pad_value,
+ int dilation_x,
+ int dilation_y)
+{
+ const int end_x = start_x + kernel_width * dilation_x;
+ const int end_y = start_y + kernel_height * dilation_y;
+ const int pad_quant = kernel_width * input_c;
+
+ for(int y = start_y; y < end_y; y += dilation_y)
+ {
+ if(y < 0 || y >= input_h)
+ {
+ memset(out_ptr, pad_value, pad_quant * sizeof(T));
+ out_ptr += pad_quant;
+ }
+ else
+ {
+ for(int x = start_x; x < end_x; x += dilation_x)
+ {
+ if(x < 0 || x >= input_w)
+ {
+ memset(out_ptr, pad_value, input_c * sizeof(T));
+ out_ptr += input_c;
+ }
+ else
+ {
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * sizeof(T));
+ out_ptr += input_c;
+ }
+ }
+ }
+ }
+
+ // Append 1 if the convolution layer has biases
+ if(has_bias)
+ {
+ *out_ptr = static_cast<T>(1);
+ }
+}
+} // namespace
+
+template <typename T, bool has_pads, bool is_nchw>
+void NEIm2ColKernel::run_im2col(const Window &window)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -184,25 +252,17 @@
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int kernel_depth = _input->info()->dimension(channel_idx);
const int input_w = _input->info()->dimension(width_idx);
const int input_h = _input->info()->dimension(height_idx);
- const int input_stride_x = _input->info()->strides_in_bytes()[width_idx];
- const int input_stride_y = _input->info()->strides_in_bytes()[height_idx];
- const int input_stride_z = _input->info()->strides_in_bytes()[channel_idx];
- const int offset = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().offset : 0;
-
- int pad_left = 0;
- int pad_top = 0;
- int stride_x = 0;
- int stride_y = 0;
- pad_left = _conv_info.pad_left();
- pad_top = _conv_info.pad_top();
- std::tie(stride_x, stride_y) = _conv_info.stride();
-
- // Setup input window
- const int start_x = -pad_left;
- const int start_y = -pad_top;
+ const int input_c = _input->info()->dimension(channel_idx);
+ const int input_stride_x = _input->info()->strides_in_bytes().x();
+ const int input_stride_y = _input->info()->strides_in_bytes().y();
+ const int input_stride_z = _input->info()->strides_in_bytes().z();
+ const int pad_left = _conv_info.pad_left();
+ const int pad_top = _conv_info.pad_top();
+ const int stride_x = _conv_info.stride().first;
+ const int stride_y = _conv_info.stride().second;
+ const int pad_value = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().offset : 0;
Window window_in_out(window);
// The first three dimensions of the input and output are increased by the inner loops
@@ -216,94 +276,70 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- const int top_left_x = id[width_idx] * stride_x + start_x;
- const int top_left_y = id[height_idx] * stride_y + start_y;
+ const int start_w = id[width_idx] * stride_x - pad_left;
+ const int start_h = id[height_idx] * stride_y - pad_top;
// Get pointers
const uint8_t *const input_ptr = in.ptr();
auto output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * _output->info()->strides_in_bytes().y());
// Linearize volume
- linearize_volume<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- top_left_x,
- top_left_y,
- static_cast<int>(_kernel_width),
- static_cast<int>(_kernel_height),
- kernel_depth,
- input_w,
- input_h,
- input_stride_x,
- input_stride_y,
- input_stride_z,
- offset,
- _dilation.x(),
- _dilation.y());
+ if(is_nchw)
+ {
+ linearize_volume_nchw<T, has_pads>(input_ptr,
+ output_ptr,
+ _has_bias,
+ start_w,
+ start_h,
+ _kernel_width,
+ _kernel_height,
+ input_c,
+ input_w,
+ input_h,
+ input_stride_x,
+ input_stride_y,
+ input_stride_z,
+ pad_value,
+ _dilation.x(),
+ _dilation.y());
+ }
+ else
+ {
+ linearize_volume_nhwc<T, has_pads>(input_ptr,
+ output_ptr,
+ _has_bias,
+ start_w,
+ start_h,
+ _kernel_width,
+ _kernel_height,
+ input_w,
+ input_h,
+ input_c,
+ input_stride_y,
+ input_stride_z,
+ pad_value,
+ _dilation.x(),
+ _dilation.y());
+ }
},
in, out);
}
-template <typename T>
-void NEIm2ColKernel::run_reduced(const Window &window)
-{
- const size_t in_width = _input->info()->dimension(0);
- const size_t in_height = _input->info()->dimension(1);
- const size_t out_step_x = in_width * _input->info()->element_size();
- const size_t out_step_y = out_step_x * in_height;
- const size_t out_width = _output->info()->dimension(0);
-
- Window in_window(window);
- in_window.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
- out_window.set(Window::DimX, Window::Dimension(out_window.x().start(), out_window.x().end(), in_width));
-
- Window in_slice = in_window.first_slice_window_3D();
- Window out_slice = out_window.first_slice_window_1D();
-
- do
- {
- Iterator in(_input, in_slice);
- Iterator out(_output, out_slice);
-
- uint8_t *out_ptr = out.ptr();
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- memcpy(out_ptr + id.y() * out_step_x + id.z() * out_step_y, in.ptr(), out_step_x);
- },
- in);
-
- // Add bias
- if(_has_bias)
- {
- *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = static_cast<T>(1);
- }
- }
- while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
-}
-
NEIm2ColKernel::NEIm2ColKernel()
: _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U)
{
}
void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
+ bool has_bias, const Size2D &dilation, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validation step
- ARM_COMPUTE_UNUSED(is_fully_connected, is_flatten);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
ARM_COMPUTE_UNUSED(num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten));
const DataLayout data_layout = input->info()->data_layout();
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
_input = input;
_output = output;
@@ -316,33 +352,20 @@
_conv_info, _dilation);
_has_bias = has_bias;
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
-
- bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
- && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1))
- && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding())
- && ((dilation.x() == 1) && (dilation.y() == 1));
-
- Window window = calculate_max_window(*input->info(), Steps());
-
- if(run_img2col_reduced)
+ if(data_layout == DataLayout::NCHW)
{
switch(_input->info()->data_type())
{
case DataType::F32:
- _func = &NEIm2ColKernel::run_reduced<float>;
+ _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float, false, true> : &NEIm2ColKernel::run_im2col<float, true, true>;
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = &NEIm2ColKernel::run_reduced<float16_t>;
+ _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, true> : &NEIm2ColKernel::run_im2col<float16_t, true, true>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::QASYMM8:
- _func = &NEIm2ColKernel::run_reduced<qasymm8_t>;
+ _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<qasymm8_t, false, true> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, true>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
@@ -354,35 +377,33 @@
switch(_input->info()->data_type())
{
case DataType::F32:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
+ _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float, false, false> : &NEIm2ColKernel::run_im2col<float, true, false>;
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<float16_t, false> : &NEIm2ColKernel::run_generic<float16_t, true>;
+ _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, false> : &NEIm2ColKernel::run_im2col<float16_t, true, false>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::QASYMM8:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qasymm8_t, false> : &NEIm2ColKernel::run_generic<qasymm8_t, true>;
+ _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<qasymm8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
}
- window.set(width_idx, Window::Dimension(0, _convolved_dims.first, 1));
- window.set(height_idx, Window::Dimension(0, _convolved_dims.second, 1));
- window.set(channel_idx, Window::Dimension(0, 1, 1));
}
- // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- IKernel::configure(window);
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
}
Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
+ bool has_bias, const Size2D &dilation, unsigned int num_groups)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation).first);
return Status{};
}
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index 4d3ec46..46b7913 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index a6e2b00..52dbe26 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -887,9 +887,12 @@
input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
}
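+ // Hoist the scratch array out of the window loop; it is now cleared at the start of
+ // each iteration instead of being re-constructed.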
+ std::array<uint8_t, mask_size> vals{ {} };
+
execute_window_loop(win, [&](const Coordinates & id)
{
- std::array<uint8_t, mask_size> vals{ {} };
+ // Clear array
+ std::fill(std::begin(vals), std::end(vals), 0);
size_t v = 0;
size_t m = 0;
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index fe6b69c..27af121 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -61,30 +62,40 @@
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *input_squared, ITensorInfo *output, const NormalizationLayerInfo &norm_info)
{
- unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
- const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
- const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
- const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
- const unsigned int border_width = (norm_idx == 2) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
- BorderSize border_size = BorderSize(0, border_width);
- bool window_changed = false;
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input->clone());
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+ const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
+ const bool is_norm_accross_width = norm_idx == 0;
+
+ const unsigned int border_width = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
+ const BorderSize border_size = BorderSize(0, border_width);
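+ // When normalizing across the width, the vectorized neighbourhood reads can run past
+ // the end of each row, so a left/right border of (vector length - 1) elements is
+ // presumably reserved to keep those reads inside allocated padding.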
// Configure window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ bool window_changed = false;
- AccessWindowRectangle input_access(input, -border_size.left, 0, num_elems_read_per_iteration, num_rows);
- AccessWindowRectangle input_squared_access(input_squared, -border_size.left, 0, num_elems_read_per_iteration, num_rows);
+ if(is_norm_accross_width)
+ {
+ AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
+ AccessWindowStatic input_squared_access(input_squared, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
+ window_changed = window_changed || update_window_and_padding(win, input_access, input_squared_access);
+ }
+ else
+ {
+ AccessWindowHorizontal input_access(input, -border_size.left, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input_squared_access(input_squared, -border_size.left, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, input_access, input_squared_access);
+ }
if(output->total_size() != 0)
{
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, input_squared_access, output_access);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
output_access.set_valid_region(win, input->valid_region());
}
- else
- {
- window_changed = update_window_and_padding(win, input_access, input_squared_access);
- }
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
@@ -110,8 +121,11 @@
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), input_squared->info(), output->info(), norm_info));
- const unsigned int norm_idx = get_normalization_dimension_index(input->info()->data_layout(), norm_info);
- const unsigned int border_width = (norm_idx == 2) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+ const unsigned int norm_idx = get_normalization_dimension_index(input->info()->data_layout(), norm_info);
+ const bool is_norm_accross_width = norm_idx == 0;
+ const unsigned int border_width = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
_input = input;
_input_squared = input_squared;
@@ -190,12 +204,10 @@
const int dim_y = 1;
const int radius = _norm_info.norm_size() / 2;
- const int total_size = _input->info()->dimension(dim) - 1;
const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
// We account for padding across X only and iterate over rows
const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
- const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
- const int min_top = 0;
+ const int max_right = _input->info()->dimension(dim) - 1;
const int max_bottom = _input->info()->dimension(dim_y) - 1;
if(dt == DataType::F32)
@@ -209,7 +221,7 @@
// Get range to normalize
const int current_row = do_2D_norm ? id[dim_y] : 0;
const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
const int first_slice = std::max(current_slice - radius, min_left);
const int last_slice = std::min(current_slice + radius, max_right);
@@ -246,7 +258,7 @@
// Get range to normalize
const int current_row = do_2D_norm ? id[dim_y] : 0;
const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
const int first_slice = std::max(current_slice - radius, min_left);
const int last_slice = std::min(current_slice + radius, max_right);
diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
index 8d3fd88..29e6d50 100644
--- a/src/core/NEON/kernels/NEPermuteKernel.cpp
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp
@@ -50,7 +50,8 @@
DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm.num_dimensions() == 3 && !(perm[0] == 2 && perm[1] == 0 && perm[2] == 1) && !(perm[0] == 1 && perm[1] == 2 && perm[2] == 0)),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm != PermutationVector{ 2U, 0U, 1U })
+ && (perm != PermutationVector{ 1U, 2U, 0U }),
"Only [2, 0, 1] and [1, 2, 0] permutation is supported");
const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
@@ -89,7 +90,7 @@
Iterator out(_output, window_out);
// CHW -> HWC
- if((_perm.num_dimensions() == 3) && (_perm[0] == 2) && (_perm[1] == 0) && (_perm[2] == 1))
+ if(_perm == PermutationVector{ 2U, 0U, 1U })
{
const int in_row_stride = _input->info()->strides_in_bytes().y() / sizeof(T);
const int in_channel_stride = _input->info()->strides_in_bytes().z() / sizeof(T);
@@ -116,7 +117,7 @@
in, out);
}
// HWC -> CHW
- else if((_perm.num_dimensions() == 3) && (_perm[0] == 1) && (_perm[1] == 2) && (_perm[2] == 0))
+ else if(_perm == PermutationVector{ 1U, 2U, 0U })
{
const int in_col_stride = _input->info()->strides_in_bytes().y() / sizeof(T);
const int in_row_stride = _input->info()->strides_in_bytes().z() / sizeof(T);
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index ad4b8f7..310560b 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -35,6 +35,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "support/ToolchainSupport.h"
@@ -47,18 +48,10 @@
#include <tuple>
using namespace arm_compute;
+using namespace misc::shape_calculator;
namespace
{
-void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
-{
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH), pooled_w);
- output_shape.set(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT), pooled_h);
-
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
-}
-
template <bool exclude_padding, DataLayout data_layout>
inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
@@ -166,7 +159,9 @@
BorderSize &border_size,
unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
{
- // Get data layout
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
+
DataLayout data_layout = input->data_layout();
unsigned int num_elems_read_per_iteration = 0;
unsigned int num_elems_horizontal_window = 0;
@@ -190,7 +185,6 @@
pool_size_x,
pool_size_y,
pad_stride_info);
- auto_init(input, output, pooled_w, pooled_h);
// If the pool size is not one of the optimized square cases, the generic MxN path is executed
num_elems_read_per_iteration = 1;
@@ -206,7 +200,7 @@
case DataType::QASYMM8:
if(is_nhwc)
{
- num_elems_processed_per_iteration = 8;
+ num_elems_processed_per_iteration = 16;
break;
}
switch(pool_size_x)
@@ -277,8 +271,7 @@
{
if(is_nhwc)
{
- const unsigned int vector_size = 16 / input->element_size();
- num_elems_processed_per_iteration = (input->data_type() == DataType::QASYMM8) ? 8 : vector_size;
+ num_elems_processed_per_iteration = 16 / input->element_size();
}
}
@@ -371,9 +364,6 @@
pool_size_y,
pad_stride_info);
- // Output auto initialization if not yet initialized
- auto_init(input->info(), output->info(), pooled_w, pooled_h);
-
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h));
@@ -1561,8 +1551,16 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
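+ // Precompute the portion of the pooling window that overlaps valid input rows and
+ // columns; this replaces the per-element boundary checks removed below.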
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
@@ -1572,21 +1570,10 @@
// Perform pooling
vres = vdupq_n_f16(0.0f);
-
- for(int y = 0; y < pool_size_y; ++y)
+ for(int y = pool_start_y; y < pool_end_y; ++y)
{
- if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ for(int x = pool_start_x; x < pool_end_x; ++x)
{
- continue;
- }
-
- for(int x = 0; x < pool_size_x; ++x)
- {
- if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
- {
- continue;
- }
-
const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
(y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
@@ -1607,20 +1594,11 @@
else
{
vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
- for(int y = 0; y < pool_size_y; ++y)
+
+ for(int y = pool_start_y; y < pool_end_y; ++y)
{
- if(y + idx_height > window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ for(int x = pool_start_x; x < pool_end_x; ++x)
{
- continue;
- }
-
- for(int x = 0; x < pool_size_x; ++x)
- {
- if(x + idx_width > window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
- {
- continue;
- }
-
const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
(y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
vres = vmaxq_f16(vres, data);
@@ -1792,8 +1770,16 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
@@ -1804,20 +1790,10 @@
// Perform pooling
vres = vdupq_n_f32(0.0f);
- for(int y = 0; y < pool_size_y; ++y)
+ for(int y = pool_start_y; y < pool_end_y; ++y)
{
- if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ for(int x = pool_start_x; x < pool_end_x; ++x)
{
- continue;
- }
-
- for(int x = 0; x < pool_size_x; ++x)
- {
- if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
- {
- continue;
- }
-
const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
(y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
@@ -1838,20 +1814,10 @@
else
{
vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
- for(int y = 0; y < pool_size_y; ++y)
+ for(int y = pool_start_y; y < pool_end_y; ++y)
{
- if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ for(int x = pool_start_x; x < pool_end_x; ++x)
{
- continue;
- }
-
- for(int x = 0; x < pool_size_x; ++x)
- {
- if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
- {
- continue;
- }
-
const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
(y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
vres = vmaxq_f32(vres, data);
@@ -1862,8 +1828,7 @@
// Calculate square-root in case of l2 pooling
if(pooling_type == PoolingType::L2)
{
- float32x4_t sqrt_reciprocal = vrsqrteq_f32(vres);
- vres = vmulq_f32(vres, vmulq_f32(vrsqrtsq_f32(vmulq_f32(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
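+ // sqrt(v) is obtained as v * (1 / sqrt(v)) using the reciprocal square-root helper.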
+ vres = vmulq_f32(vres, vinvsqrtq_f32(vres));
}
// Store result
@@ -1986,14 +1951,26 @@
const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+ const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
+
execute_window_loop(window, [&](const Coordinates & id)
{
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+
if(pooling_type != PoolingType::MAX)
{
uint32x4_t vres1 = vdupq_n_u32(0);
uint32x4_t vres2 = vdupq_n_u32(0);
+ uint32x4_t vres3 = vdupq_n_u32(0);
+ uint32x4_t vres4 = vdupq_n_u32(0);
// Calculate scale
const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
@@ -2001,63 +1978,50 @@
const float32x4_t scale_v = vdupq_n_f32(scale);
// Perform pooling
- for(int y = 0; y < pool_size_y; ++y)
+ for(int y = pool_start_y; y < pool_end_y; ++y)
{
- if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ for(int x = pool_start_x; x < pool_end_x; ++x)
{
- continue;
- }
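+ // The 16 u8 values are widened to u16 and then u32 before accumulation so that the
+ // running sums cannot overflow.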
+ const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+ (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
- for(int x = 0; x < pool_size_x; ++x)
- {
- if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
- {
- continue;
- }
-
- const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
- (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
-
- const uint16x8_t data_u16 = vmovl_u8(data);
- vres1 = vaddq_u32(vres1, vmovl_u16(vget_low_u16(data_u16)));
- vres2 = vaddq_u32(vres2, vmovl_u16(vget_high_u16(data_u16)));
+ const uint16x8_t data_u16 = vmovl_u8(vget_low_u8(data));
+ const uint16x8_t data2_u16 = vmovl_u8(vget_high_u8(data));
+ vres1 = vaddq_u32(vres1, vmovl_u16(vget_low_u16(data_u16)));
+ vres2 = vaddq_u32(vres2, vmovl_u16(vget_high_u16(data_u16)));
+ vres3 = vaddq_u32(vres3, vmovl_u16(vget_low_u16(data2_u16)));
+ vres4 = vaddq_u32(vres4, vmovl_u16(vget_high_u16(data2_u16)));
}
}
- // Divide by scale
- vres1 = vcvtq_u32_f32(vmulq_f32(vcvtq_f32_u32(vres1), scale_v));
- vres2 = vcvtq_u32_f32(vmulq_f32(vcvtq_f32_u32(vres2), scale_v));
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ vres1 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres1), scale_v));
+ vres2 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres2), scale_v));
+ vres3 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres3), scale_v));
+ vres4 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres4), scale_v));
- uint8x8_t res = vmovn_u16(vcombine_u16(vmovn_u32(vres1), vmovn_u32(vres2)));
+ uint8x8_t res1 = vmovn_u16(vcombine_u16(vmovn_u32(vres1), vmovn_u32(vres2)));
+ uint8x8_t res2 = vmovn_u16(vcombine_u16(vmovn_u32(vres3), vmovn_u32(vres4)));
// Store result
- vst1_u8(output.ptr(), res);
+ vst1_u8(output.ptr(), res1);
+ vst1_u8(output.ptr() + 8, res2);
}
else
{
- uint8x8_t vres = vdup_n_u8(0);
+ uint8x16_t vres = vdupq_n_u8(0);
- for(int y = 0; y < pool_size_y; ++y)
+ for(int y = pool_start_y; y < pool_end_y; ++y)
{
- if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+ for(int x = pool_start_x; x < pool_end_x; ++x)
{
- continue;
- }
-
- for(int x = 0; x < pool_size_x; ++x)
- {
- if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
- {
- continue;
- }
-
- const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
- (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
- vres = vmax_u8(vres, data);
+ const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+ (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+ vres = vmaxq_u8(vres, data);
}
}
// Store result
- vst1_u8(output.ptr(), vres);
+ vst1q_u8(output.ptr(), vres);
}
},
input, output);
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
new file mode 100644
index 0000000..2f63179
--- /dev/null
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+
+ // Check variances
+ const int var_size = info.variances().size();
+ if(var_size > 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
+ for(int i = 0; i < var_size; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.variances().at(i) <= 0.f, "Each variance value must be greater than 0");
+ }
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater than or equal to 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater than or equal to 0");
+
+ if(!info.max_sizes().empty())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ }
+
+ for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ }
+
+ if(output != nullptr && output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+ ARM_COMPUTE_UNUSED(input2);
+
+ Window win = {};
+ bool window_changed = false;
+ switch(input1->data_layout())
+ {
+ case DataLayout::NCHW:
+ {
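+ // Each prior emits 4 coordinates, so one window step covers every prior of a single
+ // spatial location.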
+ const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+ const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, output_access);
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ win = calculate_max_window(*output, Steps());
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
+ : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+{
+}
+
+template <DataLayout DL>
+void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
+ const int height)
+{
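+ // Box corners are normalized by the image dimensions so that the stored coordinates
+ // lie in [0, 1]; they are additionally clamped to that range when _info.clip() is set.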
+ float xmin = (center_x - box_width / 2.f) / width;
+ float ymin = (center_y - box_height / 2.f) / height;
+ float xmax = (center_x + box_width / 2.f) / width;
+ float ymax = (center_y + box_height / 2.f) / height;
+
+ switch(DL)
+ {
+ case DataLayout::NCHW:
+ {
+ float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
+ if(_info.clip())
+ {
+ static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
+ static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+ vec_elements = vmaxq_f32(vminq_f32(vec_elements, CONST_1), CONST_0);
+ }
+ vst1q_f32(out + offset, vec_elements);
+ }
+ break;
+ case DataLayout::NHWC:
+ {
+ const int output_offset = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
+ if(_info.clip())
+ {
+ xmin = std::min(std::max(xmin, 0.f), 1.f);
+ ymin = std::min(std::max(ymin, 0.f), 1.f);
+ xmax = std::min(std::max(xmax, 0.f), 1.f);
+ ymax = std::min(std::max(ymax, 0.f), 1.f);
+ }
+
+ *(out + output_offset * offset) = xmin;
+ *(out + output_offset * (offset + 1)) = ymin;
+ *(out + output_offset * (offset + 2)) = xmax;
+ *(out + output_offset * (offset + 3)) = ymax;
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
+template <DataLayout DL>
+void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
+{
+ const int num_priors = _info.aspect_ratios().size() * _info.min_sizes().size() + _info.max_sizes().size();
+
+ const int width_idx = get_data_layout_dimension_index(DL, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(DL, DataLayoutDimension::HEIGHT);
+
+ const int layer_width = _input1->info()->dimension(width_idx);
+ const int layer_height = _input1->info()->dimension(height_idx);
+
+ int img_width = _info.img_size().x;
+ int img_height = _info.img_size().y;
+ if(img_width == 0 || img_height == 0)
+ {
+ img_width = _input2->info()->dimension(width_idx);
+ img_height = _input2->info()->dimension(height_idx);
+ }
+
+ float step_x = _info.steps()[0];
+ float step_y = _info.steps()[1];
+ if(step_x == 0.f || step_y == 0.f)
+ {
+ step_x = static_cast<float>(img_width) / layer_width;
+ step_y = static_cast<float>(img_height) / layer_height;
+ }
+
+ Window slice = {};
+
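+ // Dimension 1 (NCHW) / dimension 2 (NHWC) of the output has size 2: the first entry
+ // holds the box coordinates and the second the variances, which are written
+ // explicitly at the end of the loop body. Hence the step of 2 over that dimension.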
+ switch(DL)
+ {
+ case DataLayout::NCHW:
+ slice = window.first_slice_window_2D();
+ slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
+ break;
+ case DataLayout::NHWC:
+ slice = window.first_slice_window_3D();
+ slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * num_priors));
+ slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ Iterator output(_output, slice);
+ execute_window_loop(slice, [&](const Coordinates & id)
+ {
+ float center_x = 0;
+ float center_y = 0;
+ int idx = 0;
+ switch(DL)
+ {
+ case DataLayout::NCHW:
+ idx = id.x() / (4 * num_priors);
+ center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+ center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
+ break;
+ case DataLayout::NHWC:
+ idx = id.y() / (4 * num_priors);
+ center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+ center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ float box_width;
+ float box_height;
+ int offset = 0;
+
+ auto out = reinterpret_cast<float *>(output.ptr());
+ for(unsigned int i = 0; i < _info.min_sizes().size(); ++i)
+ {
+ const float min_size = _info.min_sizes().at(i);
+ box_width = min_size;
+ box_height = min_size;
+ store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
+
+ if(!_info.max_sizes().empty())
+ {
+ const float max_size = _info.max_sizes().at(i);
+ box_width = std::sqrt(min_size * max_size);
+ box_height = box_width;
+
+ store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
+ }
+
+ // Remaining priors for the other aspect ratios (ar == 1 is already covered above)
+ for(auto ar : _info.aspect_ratios())
+ {
+ if(std::fabs(ar - 1.) < 1e-6)
+ {
+ continue;
+ }
+
+ box_width = min_size * std::sqrt(ar);
+ box_height = min_size / std::sqrt(ar);
+
+ store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
+ }
+ }
+
+ // set the variance
+ switch(DL)
+ {
+ case DataLayout::NCHW:
+ {
+ out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+ float32x4_t var;
+ if(_info.variances().size() == 1)
+ {
+ var = vdupq_n_f32(_info.variances().at(0));
+ }
+ else
+ {
+ const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
+ var = vars;
+ }
+ for(int i = 0; i < num_priors; ++i)
+ {
+ vst1q_f32(out + 4 * i, var);
+ }
+ }
+ break;
+ case DataLayout::NHWC:
+ {
+ for(int i = 0; i < num_priors; ++i)
+ {
+ const int prior_offset = 4 * i;
+ const bool single_var = _info.variances().size() == 1;
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 0, 1)))) = _info.variances().at(0);
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 1, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(1);
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 2, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(2);
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 3, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(3);
+ }
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ },
+ output);
+}
+
+void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), info));
+
+ _input1 = input1;
+ _input2 = input2;
+ _info = info;
+ _output = output;
+
+ switch(input1->info()->data_layout())
+ {
+ case DataLayout::NCHW:
+ {
+ _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NCHW>;
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NHWC>;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented.");
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info(), info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info)
+ .first);
+
+ return Status{};
+}
+void NEPriorBoxLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ // Run function
+ (this->*_func)(window);
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 30f21bb..182e93d 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -32,10 +32,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
template <class F>
@@ -57,31 +58,284 @@
Iterator in(input, in_slice);
Iterator out(output, out_slice);
- f(in, out, in_slice, out_slice);
+ f(in, out, in_slice, out_slice, *input->info());
}
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+ while(window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+ }
+ static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f)
+ {
+ // Set in window
+ Window in_window(window);
+
+ in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
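+ // Collapsing DimY means the window only iterates over the remaining dimensions; the
+ // functor itself loops along Y for every output element.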
+
+ // Get first input and output slices
+ Window in_slice = in_window.first_slice_window_2D();
+ Window out_slice = window.first_slice_window_2D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), 1);
+ }
+ while(in_window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ }
+ static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f)
+ {
+ // Set in window
+ Window in_window(window);
+
+ in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ // Get first input and output slices
+ Window in_slice = in_window.first_slice_window_3D();
+ Window out_slice = window.first_slice_window_3D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), 2);
+ }
+ while(in_window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ }
+ static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f)
+ {
+ // Set in/out window
+ Window in_window(window);
+ Window out_window(window);
+
+ in_window.set(3, Window::Dimension(0, 1, 1));
+ out_window.set(3, Window::Dimension(0, 1, 1));
+
+ // Get first input and output slices
+ Window in_slice = in_window.first_slice_window_4D();
+ Window out_slice = out_window.first_slice_window_4D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice, *input->info(), 3);
+ }
+ while(in_window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_4D(out_slice));
}
};
-struct SumsqOpX
+template <typename T, int S, ReductionOperation op>
+struct RedOpX
{
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice)
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
{
ARM_COMPUTE_UNUSED(out_slice);
- float32x4_t vec_sum_value = vdupq_n_f32(0.f);
+ auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
execute_window_loop(in_slice, [&](const Coordinates & id)
{
- const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
- const float32x4_t vec_elements = vld1q_f32(in_ptr);
- vec_sum_value = vaddq_f32(vmulq_f32(vec_elements, vec_elements), vec_sum_value);
+ const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+
+ if(op == ReductionOperation::SUM_SQUARE)
+ {
+ vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
+ }
+ else
+ {
+ vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+ }
},
input);
- float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
- carry_addition = vpadd_f32(carry_addition, carry_addition);
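+ // After the first pairwise add of the high and low halves, S / 4 further pairwise
+ // folds reduce the accumulator to a single lane (one fold for F32, two for F16).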
+ auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value));
+ for(int i = 0; i < S / 4; ++i)
+ {
+ carry_addition = wrapper::vpadd(carry_addition, carry_addition);
+ }
- *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(carry_addition, 0);
+ auto res = wrapper::vgetlane(carry_addition, 0);
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ res /= in_info.dimension(0);
+ }
+
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ }
+};
+
+template <ReductionOperation op>
+struct RedOpX_qasymm8
+{
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+ auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto vec_elements = wrapper::vloadq(input.ptr());
+
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
+ vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
+ vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
+ vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+ },
+ input);
+
+ auto carry_addition = wrapper::vadd(vec_sum_value1, vec_sum_value2);
+ carry_addition = wrapper::vadd(carry_addition, vec_sum_value3);
+ carry_addition = wrapper::vadd(carry_addition, vec_sum_value4);
+
+ auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_addition), wrapper::vgetlow(carry_addition));
+ carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
+ auto res = wrapper::vgetlane(carry_paddition, 0);
+
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ res /= in_info.dimension(0);
+ }
+
+ *(output.ptr()) = static_cast<uint8_t>(res);
+ }
+};
+
+template <typename T, int S, ReductionOperation op>
+struct RedOpYZW
+{
+ /** NEON vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
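+ // For every output element, walk the full extent of the reduction axis; the switch
+ // below selects the byte offset corresponding to the requested axis (Y, Z or W).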
+ for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr;
+ switch(axis)
+ {
+ case 1:
+ in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim)));
+ break;
+ case 2:
+ in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim)));
+ break;
+ case 3:
+ in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim)));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+
+ if(op == ReductionOperation::SUM_SQUARE)
+ {
+ vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
+ }
+ else
+ {
+ vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+ }
+ }
+
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
+ vec_sum_value = wrapper::vmul(vec_sum_value, vec_width_inv);
+ }
+
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_sum_value);
+ },
+ input, output);
+ }
+};
+
+template <ReductionOperation op>
+struct RedOpYZW_qasymm8
+{
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+ for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ uint8_t *in_ptr;
+ switch(axis)
+ {
+ case 1:
+ in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim));
+ break;
+ case 2:
+ in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim));
+ break;
+ case 3:
+ in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
+ vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
+ vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
+ vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+ }
+
+ if(op == ReductionOperation::MEAN_SUM)
+ {
+ const auto vec_width_inv = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis)));
+ const auto vec_sum_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value1), vec_width_inv);
+ const auto vec_sum_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value2), vec_width_inv);
+ const auto vec_sum_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value3), vec_width_inv);
+ const auto vec_sum_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value4), vec_width_inv);
+
+ vec_sum_value1 = vcvtq_u32_f32(vec_sum_value1_f);
+ vec_sum_value2 = vcvtq_u32_f32(vec_sum_value2_f);
+ vec_sum_value3 = vcvtq_u32_f32(vec_sum_value3_f);
+ vec_sum_value4 = vcvtq_u32_f32(vec_sum_value4_f);
+ }
+
+ const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_sum_value1), wrapper::vqmovn(vec_sum_value2));
+ const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_sum_value3), wrapper::vqmovn(vec_sum_value4));
+ auto res = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+ wrapper::vstore(output.ptr(), res);
+ },
+ input, output);
}
};
@@ -90,7 +344,186 @@
switch(axis)
{
case 0:
- return Reducer<SumsqOpX>::reduceX(window, input, output, SumsqOpX());
+ switch(input->info()->data_type())
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpX<float, 4, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM_SQUARE>());
+ case DataType::QASYMM8:
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 1:
+ switch(input->info()->data_type())
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
+ case DataType::QASYMM8:
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 2:
+ switch(input->info()->data_type())
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
+ case DataType::QASYMM8:
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 3:
+ switch(input->info()->data_type())
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
+ case DataType::QASYMM8:
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ }
+}
+
+void reduce_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+{
+ switch(axis)
+ {
+ case 0:
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpX_qasymm8<ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpX<float, 4, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM>());
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 1:
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 2:
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 3:
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ }
+}
+void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+{
+ switch(axis)
+ {
+ case 0:
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpX_qasymm8<ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::MEAN_SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpX<float, 4, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::MEAN_SUM>());
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 1:
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 2:
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ case 3:
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
default:
ARM_COMPUTE_ERROR("Unsupported reduction axis");
}
@@ -109,16 +542,15 @@
ARM_COMPUTE_UNUSED(op);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
@@ -170,10 +602,11 @@
unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
- _input = input;
- _output = output;
- _border_size = (axis == 0) ? BorderSize(0, num_elems_processed_per_iteration - (input->info()->dimension(0) % num_elems_processed_per_iteration), 0, 0) : BorderSize();
- _op = op;
+ _input = input;
+ _output = output;
+ _border_size = (axis == 0) ? BorderSize(0, num_elems_processed_per_iteration - (input->info()->dimension(0) % num_elems_processed_per_iteration), 0, 0) : BorderSize();
+ _op = op;
+ _reduction_axis = axis;
// Configure kernel window
auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
@@ -202,7 +635,14 @@
case ReductionOperation::SUM_SQUARE:
reduce_sumsq(window, _input, _output, _reduction_axis);
break;
+ case ReductionOperation::MEAN_SUM:
+ reduce_mean_sum(window, _input, _output, _reduction_axis);
+ break;
+ case ReductionOperation::SUM:
+ reduce_sum(window, _input, _output, _reduction_axis);
+ break;
default:
ARM_COMPUTE_ERROR("Unsupported reduction operation.");
}
}
+} // namespace arm_compute
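
For reference, the MEAN_SUM paths above reduce along one of four axes after dispatching on data type; their arithmetic along the Y axis is equivalent to the standalone scalar sketch below (plain C++, illustrative only — the function name and the dense x-fastest layout are assumptions, not library code).

#include <cstddef>
#include <iostream>
#include <vector>

// Mean reduction along the Y (height) axis of a dense W x H x C float tensor
// laid out as x + W * (y + H * c); the output has shape W x 1 x C.
std::vector<float> reduce_mean_y(const std::vector<float> &in, std::size_t W, std::size_t H, std::size_t C)
{
    std::vector<float> out(W * C, 0.f);
    for(std::size_t c = 0; c < C; ++c)
    {
        for(std::size_t y = 0; y < H; ++y)
        {
            for(std::size_t x = 0; x < W; ++x)
            {
                out[x + W * c] += in[x + W * (y + H * c)];
            }
        }
    }
    for(float &v : out)
    {
        v /= static_cast<float>(H);
    }
    return out;
}

int main()
{
    // 2x3x1 tensor with rows {1,2}, {3,4}, {5,6} stacked along Y
    const std::vector<float> in  = { 1, 2, 3, 4, 5, 6 };
    const std::vector<float> out = reduce_mean_y(in, 2, 3, 1);
    std::cout << out[0] << " " << out[1] << std::endl; // prints "3 4"
    return 0;
}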
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index 66115bb..edb3ffe 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -113,8 +113,8 @@
AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal mapx_access(map_x->info(), 0, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal mapy_access(map_y->info(), 0, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mapx_access(map_x->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mapy_access(map_y->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, mapx_access, mapy_access, output_access);
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
new file mode 100644
index 0000000..8baea2b
--- /dev/null
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+ // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+ DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+template <typename T>
+void NEReorgLayerKernel::run_reorg(const Window &window)
+{
+ const DataLayout data_layout = _input->info()->data_layout();
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ const unsigned int stride = _stride;
+ const unsigned int out_c = _output->info()->tensor_shape()[idx_c] / (stride * stride);
+ const uint8_t *in_ptr = _input->buffer();
+
+ // Collapse
+ Window collapsed_window = window.collapse_if_possible(window, 4);
+
+ // Create Iterator
+ Iterator out(_output, collapsed_window);
+
+ // Perform reorg
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ // Get spatial coords and channels
+ const unsigned int w = id[idx_w];
+ const unsigned int h = id[idx_h];
+ const unsigned int c = id[idx_c];
+
+ // Calculate mapping
+ const unsigned int offset = c / out_c;
+ Coordinates map_coords = id;
+ map_coords.set(idx_w, w * stride + offset % stride);
+ map_coords.set(idx_h, h * stride + offset / stride);
+ map_coords.set(idx_c, c % out_c);
+
+ // Perform mapping
+ *(reinterpret_cast<T *>(out.ptr())) = *(reinterpret_cast<const T *>(in_ptr + _input->info()->offset_element_in_bytes(map_coords)));
+ },
+ out);
+}
+
+NEReorgLayerKernel::NEReorgLayerKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _stride(1)
+{
+}
+
+void NEReorgLayerKernel::configure(const ITensor *input, ITensor *output, int32_t stride)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output auto initialization if not yet initialized
+ const TensorShape output_shape = misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
+
+ _func = nullptr;
+ _input = input;
+ _output = output;
+ _stride = stride;
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ _func = &NEReorgLayerKernel::run_reorg<uint8_t>;
+ break;
+ case 2:
+ _func = &NEReorgLayerKernel::run_reorg<uint16_t>;
+ break;
+ case 4:
+ _func = &NEReorgLayerKernel::run_reorg<uint32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ // The NEReorgLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+
+ ICPPKernel::configure(win);
+}
+
+Status NEReorgLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
+ return Status{};
+}
+
+void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+ if(_func != nullptr)
+ {
+ (this->*_func)(window);
+ }
+}
+} // namespace arm_compute
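
The coordinate mapping computed in run_reorg() above (offset = c / out_c, then w * stride + offset % stride and h * stride + offset / stride) amounts to the output-driven gather below, shown as a plain scalar sketch with an assumed dense x-fastest layout (illustrative only, not library code).

#include <cstddef>
#include <iostream>
#include <vector>

// Reorg gather with the same mapping as run_reorg(): input is W x H x C
// (x fastest), output is (W/stride) x (H/stride) x (C * stride * stride).
std::vector<float> reorg(const std::vector<float> &in, std::size_t W, std::size_t H, std::size_t C, std::size_t stride)
{
    const std::size_t Wo = W / stride, Ho = H / stride, Co = C * stride * stride;
    std::vector<float> out(Wo * Ho * Co);
    for(std::size_t c = 0; c < Co; ++c)
    {
        const std::size_t offset = c / C; // matches "offset = c / out_c" in the kernel
        for(std::size_t h = 0; h < Ho; ++h)
        {
            for(std::size_t w = 0; w < Wo; ++w)
            {
                const std::size_t w_in = w * stride + offset % stride;
                const std::size_t h_in = h * stride + offset / stride;
                const std::size_t c_in = c % C;
                out[w + Wo * (h + Ho * c)] = in[w_in + W * (h_in + H * c_in)];
            }
        }
    }
    return out;
}

int main()
{
    // 4x4 single-channel ramp, stride 2 -> output is 2x2x4
    std::vector<float> in(16);
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        in[i] = static_cast<float>(i);
    }
    const std::vector<float> out = reorg(in, 4, 4, 1, 2);
    std::cout << out[0] << " " << out[4] << std::endl; // first element of channels 0 and 1: "0 1"
    return 0;
}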
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index 8043e8b..c718991 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -35,10 +35,23 @@
#include <cstdint>
+/** [NEReshapeLayerKernel Kernel] **/
using namespace arm_compute;
namespace
{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != output->tensor_shape().total_size());
+
+ return Status{};
+}
+
template <typename T>
inline void reshape_tensor(const Window &window, const ITensor *input, ITensor *output)
{
@@ -59,29 +72,28 @@
void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input->info());
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output->info(), 0, 0, output->info()->tensor_shape().x(), output->info()->tensor_shape().y());
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ // Set the output valid region
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
INEKernel::configure(win);
}
+Status NEReshapeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+
+ return Status{};
+}
+
void NEReshapeLayerKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -109,3 +121,4 @@
ARM_COMPUTE_ERROR("Unsupported data type!");
}
}
+/** [NEReshapeLayerKernel Kernel] **/
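
Since the kernel above only validates matching data types and equal total element counts, a reshape is effectively a flat copy with new shape metadata. A minimal sketch of that idea, assuming densely packed buffers of the same element type (illustrative only):

#include <cassert>
#include <cstring>
#include <vector>

// Reshape as a flat copy: provided the element counts match (the total_size()
// check above), the payload is copied unchanged and only the shape differs.
template <typename T>
void reshape_copy(const std::vector<T> &in, std::vector<T> &out)
{
    assert(in.size() == out.size());
    std::memcpy(out.data(), in.data(), in.size() * sizeof(T));
}

int main()
{
    std::vector<float> src(2 * 3, 1.f); // logically 2x3
    std::vector<float> dst(3 * 2);      // logically 3x2
    reshape_copy(src, dst);
    return dst[5] == 1.f ? 0 : 1;
}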
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 3d19c1d..0f416de 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -282,6 +282,7 @@
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+// TODO (COMPMID-1535): Revisit FP16 approximations
float16x8_t vexp(const float16x8_t &vec)
{
float16x4x2_t res =
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 7ac6cdb..870d2c9 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
@@ -101,13 +102,12 @@
// Configure kernel window
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowStatic input_access(input, 0, 0, input->dimension(0), input->dimension(1));
-
- bool window_changed = update_window_and_padding(win, input_access);
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ bool window_changed = update_window_and_padding(win, input_access);
if(output->total_size() != 0)
{
- AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+ AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration_y, num_elems_processed_per_iteration_x);
window_changed = window_changed || update_window_and_padding(win, output_access);
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
new file mode 100644
index 0000000..5dca58e
--- /dev/null
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output, int num_elems_processed_per_iteration_x, const Size2D &info)
+{
+ const int num_elems_processed_per_iteration_x_out = num_elems_processed_per_iteration_x * info.x();
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x_out));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, 1, 0.5f, 0.5f);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x_out);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output, int num_elems_processed_per_iteration_x, const Size2D &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, output->valid_region());
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, int num_elems_processed_per_iteration_x, const Size2D &info)
+{
+ std::pair<Status, Window> win_config;
+ switch(input->data_layout())
+ {
+ case DataLayout::NCHW:
+ win_config = validate_and_configure_window_nchw(input, output, num_elems_processed_per_iteration_x, info);
+ break;
+ case DataLayout::NHWC:
+ win_config = validate_and_configure_window_nhwc(input, output, num_elems_processed_per_iteration_x, info);
+ break;
+ default:
+ win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{});
+ }
+
+ return win_config;
+}
+} // namespace
+NEUpsampleLayerKernel::NEUpsampleLayerKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _info(), _num_elems_processed_per_iteration_x()
+{
+}
+
+Status NEUpsampleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy policy)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(policy);
+
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
+
+ // Check output if configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
+ }
+
+ const int num_elems_processed_per_iteration_x = 16 / input->element_size();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ output->clone().get(), num_elems_processed_per_iteration_x, info)
+ .first);
+ return Status{};
+}
+
+void NEUpsampleLayerKernel::upsample_f32_nchw(const arm_compute::Window &window)
+{
+ Window window_in(window);
+ window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_x));
+
+ Window window_out(window);
+ window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.y()));
+
+ Iterator input(_input, window_in);
+ Iterator output(_output, window_out);
+ const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float);
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr()));
+ const float32x4_t data_out1 = { vgetq_lane_f32(data, 0), vgetq_lane_f32(data, 0), vgetq_lane_f32(data, 1), vgetq_lane_f32(data, 1) };
+ const float32x4_t data_out2 = { vgetq_lane_f32(data, 2), vgetq_lane_f32(data, 2), vgetq_lane_f32(data, 3), vgetq_lane_f32(data, 3) };
+ auto out = reinterpret_cast<float *>(output.ptr());
+
+ vst1q_f32(out, data_out1);
+ vst1q_f32(out + 4, data_out2);
+ vst1q_f32(out + offset_y_out, data_out1);
+ vst1q_f32(out + offset_y_out + 4, data_out2);
+ },
+ input, output);
+}
+
+void NEUpsampleLayerKernel::upsample_f32_nhwc(const arm_compute::Window &window)
+{
+ Window window_out(window);
+ window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.x()));
+ window_out.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), _info.y()));
+
+ Iterator input(_input, window);
+ Iterator output(_output, window_out);
+
+ const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float);
+ const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(float);
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr()));
+ auto out = reinterpret_cast<float *>(output.ptr());
+
+ vst1q_f32(out, data);
+ vst1q_f32(out + offset_y_out, data);
+ vst1q_f32(out + offset_z_out, data);
+ vst1q_f32(out + offset_y_out + offset_z_out, data);
+ },
+ input, output);
+}
+
+void NEUpsampleLayerKernel::upsample_qasymm8_nchw(const arm_compute::Window &window)
+{
+ Window window_in(window);
+ window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_x));
+
+ Window window_out(window);
+ window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.y()));
+
+ Iterator input(_input, window_in);
+ Iterator output(_output, window_out);
+ const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(uint8_t);
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr()));
+ const uint8x16_t data_out1 = { vgetq_lane_u8(data, 0), vgetq_lane_u8(data, 0), vgetq_lane_u8(data, 1), vgetq_lane_u8(data, 1),
+ vgetq_lane_u8(data, 2), vgetq_lane_u8(data, 2), vgetq_lane_u8(data, 3), vgetq_lane_u8(data, 3),
+ vgetq_lane_u8(data, 4), vgetq_lane_u8(data, 4), vgetq_lane_u8(data, 5), vgetq_lane_u8(data, 5),
+ vgetq_lane_u8(data, 6), vgetq_lane_u8(data, 6), vgetq_lane_u8(data, 7), vgetq_lane_u8(data, 7)
+ };
+ const uint8x16_t data_out2 =
+ {
+ vgetq_lane_u8(data, 8), vgetq_lane_u8(data, 8), vgetq_lane_u8(data, 9), vgetq_lane_u8(data, 9),
+ vgetq_lane_u8(data, 10), vgetq_lane_u8(data, 10), vgetq_lane_u8(data, 11), vgetq_lane_u8(data, 11),
+ vgetq_lane_u8(data, 12), vgetq_lane_u8(data, 12), vgetq_lane_u8(data, 13), vgetq_lane_u8(data, 13),
+ vgetq_lane_u8(data, 14), vgetq_lane_u8(data, 14), vgetq_lane_u8(data, 15), vgetq_lane_u8(data, 15)
+ };
+ auto out = reinterpret_cast<uint8_t *>(output.ptr());
+
+ vst1q_u8(out, data_out1);
+ vst1q_u8(out + 16, data_out2);
+ vst1q_u8(out + offset_y_out, data_out1);
+ vst1q_u8(out + offset_y_out + 16, data_out2);
+ },
+ input, output);
+}
+
+void NEUpsampleLayerKernel::upsample_qasymm8_nhwc(const arm_compute::Window &window)
+{
+ Window window_out(window);
+ window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.x()));
+ window_out.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), _info.y()));
+
+ Iterator input(_input, window);
+ Iterator output(_output, window_out);
+
+ const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(uint8_t);
+ const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(uint8_t);
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr()));
+ auto out = reinterpret_cast<uint8_t *>(output.ptr());
+
+ vst1q_u8(out, data);
+ vst1q_u8(out + offset_y_out, data);
+ vst1q_u8(out + offset_z_out, data);
+ vst1q_u8(out + offset_y_out + offset_z_out, data);
+ },
+ input, output);
+}
+
+void NEUpsampleLayerKernel::upsample_f16_nchw(const arm_compute::Window &window)
+{
+ ARM_COMPUTE_UNUSED(window);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ Window window_in(window);
+ window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_x));
+
+ Window window_out(window);
+ window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.y()));
+
+ Iterator input(_input, window_in);
+ Iterator output(_output, window_out);
+ const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float16_t);
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr()));
+ const float16x8_t data_out1 = { vgetq_lane_f16(data, 0), vgetq_lane_f16(data, 0), vgetq_lane_f16(data, 1), vgetq_lane_f16(data, 1),
+ vgetq_lane_f16(data, 2), vgetq_lane_f16(data, 2), vgetq_lane_f16(data, 3), vgetq_lane_f16(data, 3)
+ };
+ const float16x8_t data_out2 = { vgetq_lane_f16(data, 4), vgetq_lane_f16(data, 4), vgetq_lane_f16(data, 5), vgetq_lane_f16(data, 5),
+ vgetq_lane_f16(data, 6), vgetq_lane_f16(data, 6), vgetq_lane_f16(data, 7), vgetq_lane_f16(data, 7)
+ };
+ auto out = reinterpret_cast<float16_t *>(output.ptr());
+
+ vst1q_f16(out, data_out1);
+ vst1q_f16(out + 8, data_out2);
+ vst1q_f16(out + offset_y_out, data_out1);
+ vst1q_f16(out + offset_y_out + 8, data_out2);
+ },
+ input, output);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+}
+
+void NEUpsampleLayerKernel::upsample_f16_nhwc(const arm_compute::Window &window)
+{
+ ARM_COMPUTE_UNUSED(window);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ Window window_out(window);
+ window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.x()));
+ window_out.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), _info.y()));
+
+ Iterator input(_input, window);
+ Iterator output(_output, window_out);
+ const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float16_t);
+ const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(float16_t);
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr()));
+ auto out = reinterpret_cast<float16_t *>(output.ptr());
+
+ vst1q_f16(out, data);
+ vst1q_f16(out + offset_y_out, data);
+ vst1q_f16(out + offset_z_out, data);
+ vst1q_f16(out + offset_y_out + offset_z_out, data);
+ },
+ input, output);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+}
+
+void NEUpsampleLayerKernel::configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(policy);
+
+ _input = input;
+ _output = output;
+ _info = info;
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ TensorShape output_shape = misc::shape_calculator::compute_upsample_shape(*input->info(), info);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+ output->info()->set_data_layout(data_layout);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(NEUpsampleLayerKernel::validate(input->info(), output->info(), info, policy));
+
+ _num_elems_processed_per_iteration_x = 16 / output->info()->element_size();
+
+ switch(data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ _func = &NEUpsampleLayerKernel::upsample_qasymm8_nchw;
+ break;
+ case DataType::F32:
+ _func = &NEUpsampleLayerKernel::upsample_f32_nchw;
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ _func = &NEUpsampleLayerKernel::upsample_f16_nchw;
+ break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
+ }
+ case DataLayout::NHWC:
+ {
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ _func = &NEUpsampleLayerKernel::upsample_qasymm8_nhwc;
+ break;
+ case DataType::F32:
+ _func = &NEUpsampleLayerKernel::upsample_f32_nhwc;
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ _func = &NEUpsampleLayerKernel::upsample_f16_nhwc;
+ break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+
+ // Configure window
+ std::pair<Status, Window> win_config = validate_and_configure_window(input->info(),
+ output->info(),
+ _num_elems_processed_per_iteration_x,
+ info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+void NEUpsampleLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+} // namespace arm_compute
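
The NCHW F32/QASYMM8/F16 paths above all implement 2x nearest-neighbour upsampling by duplicating each input element into a 2x2 output block (vectorised over 4, 16 or 8 lanes respectively). A plain scalar sketch of the same operation, with the stride fixed to 2 as the kernel requires (illustrative only):

#include <cstddef>
#include <vector>

// 2x nearest-neighbour upsample of a W x H float plane: each input value is
// written to a 2x2 block of the 2W x 2H output.
std::vector<float> upsample2x(const std::vector<float> &in, std::size_t W, std::size_t H)
{
    const std::size_t  Wo = 2 * W;
    std::vector<float> out(4 * W * H);
    for(std::size_t y = 0; y < H; ++y)
    {
        for(std::size_t x = 0; x < W; ++x)
        {
            const float       v  = in[x + W * y];
            const std::size_t ox = 2 * x;
            const std::size_t oy = 2 * y;
            out[ox + Wo * oy]           = v;
            out[ox + 1 + Wo * oy]       = v;
            out[ox + Wo * (oy + 1)]     = v;
            out[ox + 1 + Wo * (oy + 1)] = v;
        }
    }
    return out;
}

int main()
{
    const std::vector<float> in  = { 1.f, 2.f, 3.f, 4.f }; // 2x2 input
    const std::vector<float> out = upsample2x(in, 2, 2);   // 4x4 output
    // Input (0,0)=1 fills output indices 0,1,4,5; input (1,0)=2 fills 2,3,6,7
    return (out[0] == 1.f && out[5] == 1.f && out[2] == 2.f) ? 0 : 1;
}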
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 2c9ad92..259f4fc 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -34,16 +34,12 @@
namespace
{
-template <typename T, bool is_nhwc>
+template <typename T>
void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
{
- DataLayout data_layout = input->info()->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const unsigned int kernel_size_x = input->info()->dimension(idx_width);
- const unsigned int kernel_size_y = input->info()->dimension(idx_height);
- const unsigned int kernel_depth = input->info()->dimension(idx_channel);
+ const unsigned int kernel_size_x = input->info()->dimension(0);
+ const unsigned int kernel_size_y = input->info()->dimension(1);
+ const unsigned int kernel_depth = input->info()->dimension(2);
const unsigned int input_stride_x = input->info()->strides_in_bytes().x();
const unsigned int input_stride_y = input->info()->strides_in_bytes().y();
const unsigned int input_stride_z = input->info()->strides_in_bytes().z();
@@ -71,13 +67,13 @@
for(unsigned int i = 0; i < kernel_size_x; ++i)
{
*(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
- tmp_input_ptr += is_nhwc ? input_stride_y : input_stride_x;
+ tmp_input_ptr += input_stride_x;
tmp_output_ptr += output_stride_y;
}
- curr_input_row_ptr += is_nhwc ? input_stride_z : input_stride_y;
+ curr_input_row_ptr += input_stride_y;
tmp_input_ptr = curr_input_row_ptr;
}
- curr_input_depth_ptr += is_nhwc ? input_stride_x : input_stride_z;
+ curr_input_depth_ptr += input_stride_z;
curr_input_row_ptr = curr_input_depth_ptr;
tmp_input_ptr = curr_input_depth_ptr;
}
@@ -164,24 +160,21 @@
_bias = bias;
_output = output;
- const DataLayout data_layout = input->info()->data_layout();
- const bool is_nhwc = data_layout == DataLayout::NHWC;
-
switch(_input->info()->element_size())
{
case 4:
{
- _func = is_nhwc ? &weights_reshape<uint32_t, true> : &weights_reshape<uint32_t, false>;
+ _func = &weights_reshape<uint32_t>;
break;
}
case 2:
{
- _func = is_nhwc ? &weights_reshape<uint16_t, true> : &weights_reshape<uint16_t, false>;
+ _func = &weights_reshape<uint16_t>;
break;
}
case 1:
{
- _func = is_nhwc ? &weights_reshape<uint8_t, true> : &weights_reshape<uint8_t, false>;
+ _func = &weights_reshape<uint8_t>;
break;
}
default:
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
index 1b38677..a84a6d9 100644
--- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
@@ -67,7 +67,6 @@
{
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 3);
return Status{};
}
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
index 3d7a16d..3e76a08 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -40,19 +40,27 @@
namespace
{
+inline bool is_kernel_size_supported(Size2D size)
+{
+ const std::array<Size2D, 8> supported_input_sizes = { { Size2D(1, 3), Size2D(3, 1), Size2D(5, 5), Size2D(3, 3), Size2D(1, 5), Size2D(5, 1), Size2D(7, 1), Size2D(1, 7) } };
+ return std::end(supported_input_sizes) != std::find(std::begin(supported_input_sizes), std::end(supported_input_sizes), size);
+}
+
Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != 3 && input->dimension(idx_width) != 5);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != input->dimension(idx_height));
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ const auto input_width = input->dimension(idx_width);
+ const auto input_height = input->dimension(idx_height);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(Size2D(input_width, input_height)), "Only 1x3, 3x1, 1x5, 5x1, 7x1, 1x7, 3x3 and 5x5 kernels are supported");
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
const Size2D &output_tile = winograd_info.output_tile_size;
- ARM_COMPUTE_RETURN_ERROR_ON(output_tile != Size2D(2U, 2U) && output_tile != Size2D(4U, 4U));
+ const std::array<Size2D, 8> supported_tile_sizes = { { Size2D(2U, 2U), Size2D(4U, 4U), Size2D(1U, 6U), Size2D(6U, 1U), Size2D(4, 1), Size2D(1, 4), Size2D(2, 1), Size2D(1, 2) } };
+ ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_tile_sizes) == std::find(std::begin(supported_tile_sizes), std::end(supported_tile_sizes), output_tile));
// Checks performed when output is configured
if(output->total_size() != 0)
@@ -98,8 +106,8 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != 3U && kernel_dims.width != 5U), "Winograd input transform only supports 3x3 and 5x5 kernels");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != kernel_dims.height), "Winograd input transform only supports 3x3 and 5x5 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(Size2D(kernel_dims.width, kernel_dims.height)),
+ "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported");
// Validate configured output
if(output->total_size() != 0)
@@ -151,9 +159,11 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != 3U && kernel_dims.width != 5U), "Winograd output transform only supports 3x3 and 5x5 kernels");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != kernel_dims.height), "Winograd output transform only supports 3x3 and 5x5 kernels");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((input->dimension(2) != size_t(16U)) && (input->dimension(2) != size_t(36U))), "Only 2x2 and 4x4 output tile is supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(Size2D(kernel_dims.width, kernel_dims.height)),
+ "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported");
+
+ const std::array<unsigned int, 3> supported_gemm_sizes = { { 8U, 16U, 36U } };
+ ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_gemm_sizes) == std::find(std::begin(supported_gemm_sizes), std::end(supported_gemm_sizes), input->dimension(2)));
ARM_COMPUTE_UNUSED(kernel_dims);
if(bias != nullptr)
{
@@ -201,7 +211,21 @@
}
} // namespace
-// Weights transform
+template <typename T>
+Status INEWinogradLayerTransformWeightsKernel<T>::validate(const ITensorInfo *input, const ITensorInfo *weights)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ const DataLayout data_layout = input->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(Size2D(weights->dimension(width_idx), weights->dimension(height_idx))),
+ "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+ return Status{};
+}
+
+template class INEWinogradLayerTransformWeightsKernel<float>;
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
unsigned int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int num_output_channels, int num_input_channels) const
@@ -225,6 +249,7 @@
return WinogradConv::get_kernel_matrix_stride(kernel_shape);
}
+#ifndef DOXYGEN_SKIP_THIS
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
const ITensor *weights_hwio,
@@ -246,6 +271,7 @@
win.set(Window::DimX, Window::Dimension(0, win_last, 1));
INEKernel::configure(win);
}
+#endif /* DOXYGEN_SKIP_THIS */
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
@@ -278,7 +304,13 @@
template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>;
template class NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>;
template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>;
+template class NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>;
+template class NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>;
+template class NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>;
+template class NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>;
+template class NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>;
+template class NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>;
// Input transform
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -343,14 +375,15 @@
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- const int element_size_in_bytes = _input_nhwc->info()->element_size();
- const int input_col_stride = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
- const int input_row_stride = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
- const int input_batch_stride = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
-
- InputTransform input_transform(reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes()),
+ const int element_size_in_bytes = _input_nhwc->info()->element_size();
+ const int input_col_stride = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
+ const int input_row_stride = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
+ const int input_batch_stride = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
+ const auto input_nhwc_ptr = reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes());
+ auto output_ptr = reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes());
+ InputTransform input_transform(input_nhwc_ptr,
_num_batches, _num_rows, _num_cols, _num_channels, _padding,
- reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes()),
+ output_ptr,
_matrix_stride, _num_channels, input_batch_stride, input_row_stride, input_col_stride);
// The code below cannot be moved to configure because biases hasn't been allocated at that point
@@ -371,6 +404,13 @@
template class NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>;
template class NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>;
template class NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>;
+template class NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>;
+template class NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>;
+
+template class NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>;
+template class NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>;
+template class NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>;
+template class NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>;
// Output transform
@@ -438,7 +478,6 @@
Window win;
auto win_last = output_transform.get_window();
win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-
_output_nhwc->info()->set_valid_region(ValidRegion(Coordinates(), _output_nhwc->info()->tensor_shape()));
INEKernel::configure(win);
@@ -452,10 +491,14 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
ARM_COMPUTE_ERROR_ON_NULLPTR(_output_nhwc);
+ const int out_batch_stride = 0;
+ const int out_row_stride = _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T);
+ const int out_col_stride = _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T);
+
OutputTransform output_transform(reinterpret_cast<T *>(_output_workspace->buffer()), _matrix_stride, _matrix_row_stride,
(_biases ? reinterpret_cast<T *>(_biases->buffer() + _biases->info()->offset_first_element_in_bytes()) : nullptr),
reinterpret_cast<T *>(_output_nhwc->buffer() + _output_nhwc->info()->offset_first_element_in_bytes()),
- _num_batches, _num_rows, _num_cols, _num_channels, 0, _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T), _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T));
+ _num_batches, _num_rows, _num_cols, _num_channels, out_batch_stride, out_row_stride, out_col_stride);
// The code below cannot be moved to configure because biases hasn't been allocated at that point
const size_t fst = window.x().start();
@@ -478,5 +521,12 @@
template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>;
template class NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>;
template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>;
+template class NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>;
+template class NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>;
+
+template class NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>;
+template class NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>;
+template class NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>;
+template class NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>;
} // namespace arm_compute
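
The supported GEMM sizes checked above (8, 16 and 36) follow from the Winograd tile arithmetic: for F(m, r) the transformed tile has m + r - 1 points per dimension, so the number of GEMMs equals the tile area. A small worked check for the configurations instantiated above (illustrative arithmetic, not library code):

#include <iostream>

// Number of points in the transformed Winograd tile for output tile
// (out_rows x out_cols) and kernel (k_rows x k_cols).
constexpr unsigned int tile_area(unsigned int out_rows, unsigned int out_cols, unsigned int k_rows, unsigned int k_cols)
{
    return (out_rows + k_rows - 1) * (out_cols + k_cols - 1);
}

int main()
{
    std::cout << tile_area(2, 2, 3, 3) << "\n"; // F(2x2, 3x3) -> 16
    std::cout << tile_area(4, 4, 3, 3) << "\n"; // F(4x4, 3x3) -> 36
    std::cout << tile_area(2, 2, 5, 5) << "\n"; // F(2x2, 5x5) -> 36
    std::cout << tile_area(1, 6, 1, 3) << "\n"; // F(1x6, 1x3) -> 8
    std::cout << tile_area(1, 4, 1, 5) << "\n"; // F(1x4, 1x5) -> 8
    std::cout << tile_area(1, 2, 1, 7) << "\n"; // F(1x2, 1x7) -> 8
    return 0;
}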
diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
new file mode 100644
index 0000000..009562b
--- /dev/null
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "arm_compute/core/QAsymm8.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
+
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_classes <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % (num_classes + 5)) != 0);
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ if(output != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output, *input);
+ }
+
+ const bool is_nchw = input->data_layout() == DataLayout::NCHW;
+ const unsigned int num_elems_processed_per_iteration = is_nchw ? 16 / input->element_size() : 1;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ bool window_changed = false;
+
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+ }
+ else
+ {
+ window_changed = update_window_and_padding(win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NEYOLOLayerKernel::NEYOLOLayerKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _act_info(), _num_classes()
+{
+}
+
+void NEYOLOLayerKernel::yolo_layer_fp32_nchw(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float32x4_t res = vld1q_f32(reinterpret_cast<float *>(input.ptr()));
+
+ const int box_ch_id = id.z() % (_num_classes + 5);
+ const bool activate = box_ch_id != 2 && box_ch_id != 3;
+
+ // Perform activation
+ if(activate)
+ {
+ auto activation = ::detail::logistic<float, 4>(_act_info);
+ activation(res);
+ }
+
+ // Store results
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
+ },
+ input, output);
+}
+
+void NEYOLOLayerKernel::yolo_layer_fp32_nhwc(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float res = *(reinterpret_cast<float *>(input.ptr()));
+
+ const int box_ch_id = id.x() % (_num_classes + 5);
+ const bool activate = box_ch_id != 2 && box_ch_id != 3;
+
+ // Perform activation
+ if(activate)
+ {
+ res = 1.f / (1.f + std::exp(-res));
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = res;
+ },
+ input, output);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+void NEYOLOLayerKernel::yolo_layer_fp16_nchw(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float16x8_t res = vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()));
+
+ const int box_ch_id = id.z() % (_num_classes + 5);
+ const bool activate = box_ch_id != 2 && box_ch_id != 3;
+
+ // Perform activation
+ if(activate)
+ {
+ auto activation = ::detail::logistic<float16_t, 8>(_act_info);
+ activation(res);
+ }
+
+ // Store results
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
+ },
+ input, output);
+}
+
+void NEYOLOLayerKernel::yolo_layer_fp16_nhwc(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float16_t res = *(reinterpret_cast<float16_t *>(input.ptr()));
+
+ const int box_ch_id = id.x() % (_num_classes + 5);
+ const bool activate = box_ch_id != 2 && box_ch_id != 3;
+
+ // Perform activation
+ if(activate)
+ {
+ res = 1.f / (1.f + std::exp(-res));
+ }
+
+ // Store result
+ *(reinterpret_cast<float16_t *>(output.ptr())) = res;
+ },
+ input, output);
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+void NEYOLOLayerKernel::configure(ITensor *input, ITensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info, num_classes));
+
+ _input = input;
+ _output = output;
+ _act_info = act_info;
+ _num_classes = num_classes;
+
+ switch(_input->info()->data_type())
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ _func = (_input->info()->data_layout() == DataLayout::NHWC) ? &NEYOLOLayerKernel::yolo_layer_fp16_nhwc : &NEYOLOLayerKernel::yolo_layer_fp16_nchw;
+ break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ _func = (_input->info()->data_layout() == DataLayout::NHWC) ? &NEYOLOLayerKernel::yolo_layer_fp32_nhwc : &NEYOLOLayerKernel::yolo_layer_fp32_nchw;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICPPKernel::configure(win_config.second);
+}
+
+Status NEYOLOLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info, num_classes));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output == nullptr) ? nullptr : output->clone().get()).first);
+
+ return Status{};
+}
+
+void NEYOLOLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
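
Across all four specialisations above, the YOLO layer applies the same rule: within every (num_classes + 5) channel block, channels 2 and 3 (the box width/height) are left untouched and every other channel goes through the logistic function. A scalar sketch of that rule, assuming a channel-major layout (illustrative only, not library code):

#include <cmath>
#include <cstddef>
#include <vector>

// Apply the YOLO head activation to a num_channels x plane_size buffer.
void yolo_activate(std::vector<float> &data, std::size_t num_channels, std::size_t plane_size, int num_classes)
{
    for(std::size_t c = 0; c < num_channels; ++c)
    {
        const int  box_ch_id = static_cast<int>(c % static_cast<std::size_t>(num_classes + 5));
        const bool activate  = box_ch_id != 2 && box_ch_id != 3;
        if(!activate)
        {
            continue; // box width/height channels are passed through unchanged
        }
        for(std::size_t i = 0; i < plane_size; ++i)
        {
            float &v = data[c * plane_size + i];
            v        = 1.f / (1.f + std::exp(-v));
        }
    }
}

int main()
{
    // One box with num_classes = 1 -> 6 channels, one pixel each
    std::vector<float> data(6, 0.f);
    yolo_activate(data, 6, 1, 1);
    // Channels 2 and 3 stay 0, the rest become logistic(0) = 0.5
    return (data[0] == 0.5f && data[2] == 0.f) ? 0 : 1;
}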
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index 4579ebd..9194bdd 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -34,10 +34,22 @@
#include "kernels/a64_hgemm_24x8.hpp"
#include "kernels/a64_sgemm_12x8.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
namespace arm_gemm {
-#ifdef __aarch64__
+#ifdef __ARM_FEATURE_SVE
+class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> {
+public:
+
+ UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
+ return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args));
+ }
+
+ GemmImpl_gemm_fp16_interleaved_fp16() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED_FP16) { }
+};
+
+#elif defined(__aarch64__)
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)
class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> {
@@ -73,13 +85,13 @@
GemmImpl_gemm_fp16_interleaved() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED) { }
};
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE))
static GemmImpl_gemm_fp16_interleaved_fp16 gemm_fp16_interleaved_fp16_impl{};
#endif
static GemmImpl_gemm_fp16_interleaved gemm_fp16_interleaved_impl{};
static std::vector<GemmImplementation<__fp16, __fp16> *> gemm_fp16_methods = {
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE))
&gemm_fp16_interleaved_fp16_impl,
#endif
&gemm_fp16_interleaved_impl
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index e840e90..7d14971 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -36,10 +36,12 @@
#include "kernels/a64_sgemv_pretransposed.hpp"
#include "kernels/a64_sgemm_native_16x4.hpp"
+#include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
+
namespace arm_gemm {
-#ifdef __aarch64__
-// SGEMM implementations for AArch64
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
+// SGEMM implementations for AArch64 without SVE
// Pretransposed GEMV
class GemmImpl_sgemm_gemv_pretransposed : public GemmImplementation<float, float> {
@@ -92,7 +94,9 @@
class GemmImpl_sgemm_gemm_interleaved : public GemmImplementation<float, float> {
public:
UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
-#ifdef __aarch64__
+#ifdef __ARM_FEATURE_SVE
+ return UniqueGemmCommon<float, float> (new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args));
+#elif defined(__aarch64__)
return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_12x8, float, float>(args));
#elif defined(__arm__)
return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_8x6, float, float>(args));
@@ -105,7 +109,7 @@
};
static GemmImpl_gemv_batched<float, float> gemv_batched_impl{};
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
static GemmImpl_sgemm_gemv_pretransposed sgemm_gemv_pretransposed_impl{};
static GemmImpl_sgemm_gemv_native_transposed sgemm_gemv_native_transposed_impl{};
static GemmImpl_sgemm_gemm_native sgemm_gemm_native_impl{};
@@ -115,7 +119,7 @@
/* List of implementations (order matters) */
static std::vector<GemmImplementation<float, float> *> SGemmMethods = {
&gemv_batched_impl,
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
&sgemm_gemv_pretransposed_impl,
&sgemm_gemv_native_transposed_impl,
&sgemm_gemm_native_impl,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index b7e8fa2..ad171a7 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -59,4 +59,4 @@
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index dffa056..627d8ab 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -31,9 +31,21 @@
#include "kernels/a64_gemm_s16_12x8.hpp"
#include "kernels/a64_gemm_s8_12x8.hpp"
#include "kernels/a64_gemm_s8_4x4.hpp"
+#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
namespace arm_gemm {
+#ifdef __ARM_FEATURE_SVE
+class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation<int8_t, int32_t> {
+public:
+ UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
+ return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args));
+ }
+
+ GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
+};
+#else
+
class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation<int8_t, int32_t> {
public:
bool is_supported(const GemmArgs<int32_t> &args) override {
@@ -47,6 +59,8 @@
GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
};
+#endif
+
class GemmImpl_gemm_s8_interleaved : public GemmImplementation<int8_t, int32_t> {
public:
UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index bfa4908..0e58a4d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -450,6 +450,7 @@
return _pretransposed && (_B_transposed==nullptr);
}
+ // TODO: this could almost certainly be considerably simpler.
size_t get_B_pretransposed_array_size() const override {
size_t total=0;
blockwalker current(*this);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
index 6bc7df0..baa1316 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -76,7 +76,7 @@
GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmultis, const Tr beta) :
_Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmultis(nmultis), _beta(beta), _ci(ci) {
- /* For now don't do any blocking.*/
+ /* For now don't do any blocking. TODO: figure out if we should. */
k_block = K;
n_block = N;
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 60b7954..b7c1bab 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -31,9 +31,20 @@
#include "kernels/a64_gemm_u16_12x8.hpp"
#include "kernels/a64_gemm_u8_12x8.hpp"
#include "kernels/a64_gemm_u8_4x4.hpp"
+#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
namespace arm_gemm {
+#ifdef __ARM_FEATURE_SVE
+class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation<uint8_t, uint32_t> {
+public:
+ UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
+ return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args));
+ }
+
+ GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
+};
+#else
class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation<uint8_t, uint32_t> {
public:
bool is_supported(const GemmArgs<uint32_t> &args) override {
@@ -46,6 +57,7 @@
GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
};
+#endif
class GemmImpl_gemm_u8_interleaved : public GemmImplementation<uint8_t, uint32_t> {
public:
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
index e37d4c5..241c5fe 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -65,7 +65,7 @@
GemvNativeTransposed & operator= (GemvNativeTransposed &) = delete;
GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta) : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci) {
- /* For now don't do any blocking.*/
+ /* For now don't do any blocking. TODO: figure out if we should. */
m_block = K;
n_block = N;
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index d745883..e53ddb2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -71,7 +71,7 @@
GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const bool trB, const Tr beta) :
_Nsize(N), _Ksize(K), _nmultis(nmultis), _trB(trB), _beta(beta), _ci(ci),
_buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) {
- /* For now don't do any blocking.*/
+ /* For now don't do any blocking. TODO: figure out if we should. */
m_block = K;
n_block = N;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
new file mode 100644
index 0000000..3fd738e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void sve_interleaved_fp16_mla_3VLx8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+class interleaved_fp16_mla_3VLx8 {
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+ /* Kernel blocking parameters */
+ static int out_width()
+ {
+ return svcnth() * 3;
+ }
+
+ static int out_height()
+ {
+ return 8;
+ }
+
+ static int k_unroll()
+ {
+ return 1;
+ }
+
+ // Use the standard fixed size transforms.
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+
+ kern_type kernel=sve_interleaved_fp16_mla_3VLx8;
+
+ interleaved_fp16_mla_3VLx8(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
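Note: out_width() and out_height() above describe the C tile covered by one call of this kernel: three vectors of __fp16 across by eight rows down, i.e. 24 accumulators (z8..z31 in the generic.cpp that follows). A small sketch of the arithmetic, assuming an SVE-enabled toolchain with <arm_sve.h>:

    // Illustrative only: size of the C tile produced per kernel call.
    #include <arm_sve.h>
    #include <cstdio>

    int main() {
        const int out_width  = static_cast<int>(svcnth()) * 3; // 3 vectors of __fp16 per row
        const int out_height = 8;                               // 8 rows of the A panel
        // With 256-bit vectors svcnth() == 16, giving a 48 x 8 tile held in
        // the 24 accumulator registers z8..z31 used by the assembly below.
        std::printf("C tile per call: %d x %d\n", out_width, out_height);
        return 0;
    }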
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
new file mode 100644
index 0000000..92ec888
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+ const __fp16 *a_ptr = Apanel;
+ __fp16 *c_ptr = Cpanel;
+
+ const long loops_count = (K / 2) - 1;
+ const long tails_count = K % 2;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const __fp16 *a_ptr0 = a_ptr;
+ const __fp16 *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ long loops = loops_count;
+ long tails = tails_count;
+
+ __asm __volatile (
+ "mov z8.h, #0\n"
+ "ptrue p0.h\n"
+ "mov z9.h, #0\n"
+ "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
+ "mov z10.h, #0\n"
+ "ld1h z2.h, p0/z, [%[b_ptr]]\n"
+ "mov z11.h, #0\n"
+ "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "mov z12.h, #0\n"
+ "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "mov z13.h, #0\n"
+ "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
+ "mov z14.h, #0\n"
+ "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
+ "mov z15.h, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ "mov z16.h, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
+ "mov z17.h, #0\n"
+ "mov z18.h, #0\n"
+ "mov z19.h, #0\n"
+ "mov z20.h, #0\n"
+ "mov z21.h, #0\n"
+ "mov z22.h, #0\n"
+ "mov z23.h, #0\n"
+ "mov z24.h, #0\n"
+ "mov z25.h, #0\n"
+ "mov z26.h, #0\n"
+ "mov z27.h, #0\n"
+ "mov z28.h, #0\n"
+ "mov z29.h, #0\n"
+ "mov z30.h, #0\n"
+ "mov z31.h, #0\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "fmla z8.h, z2.h, z0.h[0]\n"
+ "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "fmla z9.h, z2.h, z0.h[1]\n"
+ "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
+ "fmla z10.h, z2.h, z0.h[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z11.h, z2.h, z0.h[3]\n"
+ "fmla z12.h, z2.h, z0.h[4]\n"
+ "fmla z13.h, z2.h, z0.h[5]\n"
+ "fmla z14.h, z2.h, z0.h[6]\n"
+ "fmla z15.h, z2.h, z0.h[7]\n"
+ "ld1h z2.h, p0/z, [%[b_ptr]]\n"
+ "fmla z16.h, z3.h, z0.h[0]\n"
+ "fmla z17.h, z3.h, z0.h[1]\n"
+ "fmla z18.h, z3.h, z0.h[2]\n"
+ "fmla z19.h, z3.h, z0.h[3]\n"
+ "fmla z20.h, z3.h, z0.h[4]\n"
+ "fmla z21.h, z3.h, z0.h[5]\n"
+ "fmla z22.h, z3.h, z0.h[6]\n"
+ "fmla z23.h, z3.h, z0.h[7]\n"
+ "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "fmla z24.h, z4.h, z0.h[0]\n"
+ "fmla z25.h, z4.h, z0.h[1]\n"
+ "fmla z26.h, z4.h, z0.h[2]\n"
+ "fmla z27.h, z4.h, z0.h[3]\n"
+ "fmla z28.h, z4.h, z0.h[4]\n"
+ "fmla z29.h, z4.h, z0.h[5]\n"
+ "fmla z30.h, z4.h, z0.h[6]\n"
+ "fmla z31.h, z4.h, z0.h[7]\n"
+ "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "fmla z8.h, z5.h, z1.h[0]\n"
+ "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
+ "fmla z9.h, z5.h, z1.h[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ "fmla z10.h, z5.h, z1.h[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
+ "fmla z11.h, z5.h, z1.h[3]\n"
+ "fmla z12.h, z5.h, z1.h[4]\n"
+ "fmla z13.h, z5.h, z1.h[5]\n"
+ "fmla z14.h, z5.h, z1.h[6]\n"
+ "fmla z15.h, z5.h, z1.h[7]\n"
+ "ld1h z5.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ "fmla z16.h, z6.h, z1.h[0]\n"
+ "fmla z17.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z1.h[2]\n"
+ "fmla z19.h, z6.h, z1.h[3]\n"
+ "fmla z20.h, z6.h, z1.h[4]\n"
+ "fmla z21.h, z6.h, z1.h[5]\n"
+ "fmla z22.h, z6.h, z1.h[6]\n"
+ "fmla z23.h, z6.h, z1.h[7]\n"
+ "ld1h z6.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ "fmla z24.h, z7.h, z1.h[0]\n"
+ "fmla z25.h, z7.h, z1.h[1]\n"
+ "fmla z26.h, z7.h, z1.h[2]\n"
+ "fmla z27.h, z7.h, z1.h[3]\n"
+ "fmla z28.h, z7.h, z1.h[4]\n"
+ "fmla z29.h, z7.h, z1.h[5]\n"
+ "fmla z30.h, z7.h, z1.h[6]\n"
+ "fmla z31.h, z7.h, z1.h[7]\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[tails], 3f\n"
+ "fmla z8.h, z2.h, z0.h[0]\n"
+ "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "fmla z9.h, z2.h, z0.h[1]\n"
+ "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
+ "fmla z10.h, z2.h, z0.h[2]\n"
+ "fmla z11.h, z2.h, z0.h[3]\n"
+ "fmla z12.h, z2.h, z0.h[4]\n"
+ "fmla z13.h, z2.h, z0.h[5]\n"
+ "fmla z14.h, z2.h, z0.h[6]\n"
+ "fmla z15.h, z2.h, z0.h[7]\n"
+ "ld1h z2.h, p0/z, [%[b_ptr]]\n"
+ "fmla z16.h, z3.h, z0.h[0]\n"
+ "fmla z17.h, z3.h, z0.h[1]\n"
+ "fmla z18.h, z3.h, z0.h[2]\n"
+ "fmla z19.h, z3.h, z0.h[3]\n"
+ "fmla z20.h, z3.h, z0.h[4]\n"
+ "fmla z21.h, z3.h, z0.h[5]\n"
+ "fmla z22.h, z3.h, z0.h[6]\n"
+ "fmla z23.h, z3.h, z0.h[7]\n"
+ "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "fmla z24.h, z4.h, z0.h[0]\n"
+ "fmla z25.h, z4.h, z0.h[1]\n"
+ "fmla z26.h, z4.h, z0.h[2]\n"
+ "fmla z27.h, z4.h, z0.h[3]\n"
+ "fmla z28.h, z4.h, z0.h[4]\n"
+ "fmla z29.h, z4.h, z0.h[5]\n"
+ "fmla z30.h, z4.h, z0.h[6]\n"
+ "fmla z31.h, z4.h, z0.h[7]\n"
+ "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "fmla z8.h, z5.h, z1.h[0]\n"
+ "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
+ "fmla z9.h, z5.h, z1.h[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x10\n"
+ "fmla z10.h, z5.h, z1.h[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
+ "fmla z11.h, z5.h, z1.h[3]\n"
+ "fmla z12.h, z5.h, z1.h[4]\n"
+ "fmla z13.h, z5.h, z1.h[5]\n"
+ "fmla z14.h, z5.h, z1.h[6]\n"
+ "fmla z15.h, z5.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z1.h[0]\n"
+ "fmla z17.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z1.h[2]\n"
+ "fmla z19.h, z6.h, z1.h[3]\n"
+ "fmla z20.h, z6.h, z1.h[4]\n"
+ "fmla z21.h, z6.h, z1.h[5]\n"
+ "fmla z22.h, z6.h, z1.h[6]\n"
+ "fmla z23.h, z6.h, z1.h[7]\n"
+ "fmla z24.h, z7.h, z1.h[0]\n"
+ "fmla z25.h, z7.h, z1.h[1]\n"
+ "fmla z26.h, z7.h, z1.h[2]\n"
+ "fmla z27.h, z7.h, z1.h[3]\n"
+ "fmla z28.h, z7.h, z1.h[4]\n"
+ "fmla z29.h, z7.h, z1.h[5]\n"
+ "fmla z30.h, z7.h, z1.h[6]\n"
+ "fmla z31.h, z7.h, z1.h[7]\n"
+ "fmla z8.h, z2.h, z0.h[0]\n"
+ "st1h z8.h, p0, [%[c_ptr]]\n"
+ "fmla z9.h, z2.h, z0.h[1]\n"
+ "fmla z10.h, z2.h, z0.h[2]\n"
+ "fmla z11.h, z2.h, z0.h[3]\n"
+ "fmla z12.h, z2.h, z0.h[4]\n"
+ "fmla z13.h, z2.h, z0.h[5]\n"
+ "fmla z14.h, z2.h, z0.h[6]\n"
+ "fmla z15.h, z2.h, z0.h[7]\n"
+ "fmla z16.h, z3.h, z0.h[0]\n"
+ "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
+ "fmla z17.h, z3.h, z0.h[1]\n"
+ "fmla z18.h, z3.h, z0.h[2]\n"
+ "fmla z19.h, z3.h, z0.h[3]\n"
+ "fmla z20.h, z3.h, z0.h[4]\n"
+ "fmla z21.h, z3.h, z0.h[5]\n"
+ "fmla z22.h, z3.h, z0.h[6]\n"
+ "fmla z23.h, z3.h, z0.h[7]\n"
+ "fmla z24.h, z4.h, z0.h[0]\n"
+ "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
+ "fmla z25.h, z4.h, z0.h[1]\n"
+ "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
+ "fmla z26.h, z4.h, z0.h[2]\n"
+ "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
+ "fmla z27.h, z4.h, z0.h[3]\n"
+ "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
+ "fmla z28.h, z4.h, z0.h[4]\n"
+ "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "fmla z29.h, z4.h, z0.h[5]\n"
+ "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
+ "fmla z30.h, z4.h, z0.h[6]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
+ "fmla z31.h, z4.h, z0.h[7]\n"
+ "b 4f\n"
+ "3:\n"
+ "fmla z8.h, z2.h, z0.h[0]\n"
+ "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "fmla z9.h, z2.h, z0.h[1]\n"
+ "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
+ "fmla z10.h, z2.h, z0.h[2]\n"
+ "fmla z11.h, z2.h, z0.h[3]\n"
+ "fmla z12.h, z2.h, z0.h[4]\n"
+ "fmla z13.h, z2.h, z0.h[5]\n"
+ "fmla z14.h, z2.h, z0.h[6]\n"
+ "fmla z15.h, z2.h, z0.h[7]\n"
+ "fmla z16.h, z3.h, z0.h[0]\n"
+ "fmla z17.h, z3.h, z0.h[1]\n"
+ "fmla z18.h, z3.h, z0.h[2]\n"
+ "fmla z19.h, z3.h, z0.h[3]\n"
+ "fmla z20.h, z3.h, z0.h[4]\n"
+ "fmla z21.h, z3.h, z0.h[5]\n"
+ "fmla z22.h, z3.h, z0.h[6]\n"
+ "fmla z23.h, z3.h, z0.h[7]\n"
+ "fmla z24.h, z4.h, z0.h[0]\n"
+ "fmla z25.h, z4.h, z0.h[1]\n"
+ "fmla z26.h, z4.h, z0.h[2]\n"
+ "fmla z27.h, z4.h, z0.h[3]\n"
+ "fmla z28.h, z4.h, z0.h[4]\n"
+ "fmla z29.h, z4.h, z0.h[5]\n"
+ "fmla z30.h, z4.h, z0.h[6]\n"
+ "fmla z31.h, z4.h, z0.h[7]\n"
+ "fmla z8.h, z5.h, z1.h[0]\n"
+ "st1h z8.h, p0, [%[c_ptr]]\n"
+ "fmla z9.h, z5.h, z1.h[1]\n"
+ "fmla z10.h, z5.h, z1.h[2]\n"
+ "fmla z11.h, z5.h, z1.h[3]\n"
+ "fmla z12.h, z5.h, z1.h[4]\n"
+ "fmla z13.h, z5.h, z1.h[5]\n"
+ "fmla z14.h, z5.h, z1.h[6]\n"
+ "fmla z15.h, z5.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z1.h[0]\n"
+ "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
+ "fmla z17.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z1.h[2]\n"
+ "fmla z19.h, z6.h, z1.h[3]\n"
+ "fmla z20.h, z6.h, z1.h[4]\n"
+ "fmla z21.h, z6.h, z1.h[5]\n"
+ "fmla z22.h, z6.h, z1.h[6]\n"
+ "fmla z23.h, z6.h, z1.h[7]\n"
+ "fmla z24.h, z7.h, z1.h[0]\n"
+ "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
+ "fmla z25.h, z7.h, z1.h[1]\n"
+ "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
+ "fmla z26.h, z7.h, z1.h[2]\n"
+ "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
+ "fmla z27.h, z7.h, z1.h[3]\n"
+ "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
+ "fmla z28.h, z7.h, z1.h[4]\n"
+ "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "fmla z29.h, z7.h, z1.h[5]\n"
+ "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
+ "fmla z30.h, z7.h, z1.h[6]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
+ "fmla z31.h, z7.h, z1.h[7]\n"
+ "4:\n"
+ "st1h z26.h, p0, [%[c_ptr], #-8, MUL VL]\n"
+ "st1h z11.h, p0, [%[c_ptr], #-7, MUL VL]\n"
+ "st1h z19.h, p0, [%[c_ptr], #-6, MUL VL]\n"
+ "st1h z27.h, p0, [%[c_ptr], #-5, MUL VL]\n"
+ "st1h z12.h, p0, [%[c_ptr], #-4, MUL VL]\n"
+ "st1h z20.h, p0, [%[c_ptr], #-3, MUL VL]\n"
+ "st1h z28.h, p0, [%[c_ptr], #-2, MUL VL]\n"
+ "st1h z13.h, p0, [%[c_ptr], #-1, MUL VL]\n"
+ "st1h z21.h, p0, [%[c_ptr]]\n"
+ "st1h z29.h, p0, [%[c_ptr], #1, MUL VL]\n"
+ "st1h z14.h, p0, [%[c_ptr], #2, MUL VL]\n"
+ "st1h z22.h, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1h z30.h, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1h z15.h, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1h z23.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1h z31.h, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #8\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [loops] "+r" (loops), [tails] "+r" (tails)
+ :
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
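Note on the loop bookkeeping shared by these SVE kernels: the main loop handles two K-steps per iteration, and the epilogue always finishes the final two (tails == 0) or three (tails == 1) steps, which is why loops is computed as (K / 2) - 1 rather than K / 2. A small check of that arithmetic, illustrative only and assuming K >= 2:

    // Verifies that loops * 2 + epilogue == K for the scheme used above.
    #include <cassert>

    int main() {
        for (long K = 2; K <= 1000; ++K) {
            const long loops = (K / 2) - 1;
            const long tails = K % 2;
            assert(loops * 2 + (tails ? 3 : 2) == K);
        }
        return 0;
    }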
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
new file mode 100644
index 0000000..b2327f3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void sve_interleaved_fp32_mla_3VLx8(const float *, const float *, float *, int, int, int);
+
+class interleaved_fp32_mla_3VLx8 {
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+ /* Kernel blocking parameters */
+ static int out_width()
+ {
+ return svcntw() * 3;
+ }
+
+ static int out_height()
+ {
+ return 8;
+ }
+
+ static int k_unroll()
+ {
+ return 1;
+ }
+
+ // Use the standard fixed size transforms.
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+
+ kern_type kernel=sve_interleaved_fp32_mla_3VLx8;
+
+ interleaved_fp32_mla_3VLx8(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
new file mode 100644
index 0000000..bb08fc7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ const long loops_count = (K / 2) - 1;
+ const long tails_count = K % 2;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ long loops = loops_count;
+ long tails = tails_count;
+
+ __asm __volatile (
+ "mov z8.s, #0\n"
+ "ptrue p0.s\n"
+ "mov z9.s, #0\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+ "mov z10.s, #0\n"
+ "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+ "mov z11.s, #0\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+ "mov z12.s, #0\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "mov z13.s, #0\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+ "mov z14.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ "mov z15.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
+ "mov z16.s, #0\n"
+ "mov z17.s, #0\n"
+ "mov z18.s, #0\n"
+ "mov z19.s, #0\n"
+ "mov z20.s, #0\n"
+ "mov z21.s, #0\n"
+ "mov z22.s, #0\n"
+ "mov z23.s, #0\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "fmla z8.s, z4.s, z0.s[0]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "fmla z9.s, z4.s, z0.s[1]\n"
+ "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+ "fmla z10.s, z4.s, z0.s[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z11.s, z4.s, z0.s[3]\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
+ "fmla z21.s, z4.s, z1.s[1]\n"
+ "fmla z22.s, z4.s, z1.s[2]\n"
+ "fmla z23.s, z4.s, z1.s[3]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+ "fmla z12.s, z5.s, z0.s[0]\n"
+ "fmla z13.s, z5.s, z0.s[1]\n"
+ "fmla z14.s, z5.s, z0.s[2]\n"
+ "fmla z15.s, z5.s, z0.s[3]\n"
+ "fmla z24.s, z5.s, z1.s[0]\n"
+ "fmla z25.s, z5.s, z1.s[1]\n"
+ "fmla z26.s, z5.s, z1.s[2]\n"
+ "fmla z27.s, z5.s, z1.s[3]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "fmla z16.s, z6.s, z0.s[0]\n"
+ "fmla z17.s, z6.s, z0.s[1]\n"
+ "fmla z18.s, z6.s, z0.s[2]\n"
+ "fmla z19.s, z6.s, z0.s[3]\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+ "fmla z28.s, z6.s, z1.s[0]\n"
+ "fmla z29.s, z6.s, z1.s[1]\n"
+ "fmla z30.s, z6.s, z1.s[2]\n"
+ "fmla z31.s, z6.s, z1.s[3]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "fmla z8.s, z4.s, z2.s[0]\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+ "fmla z9.s, z4.s, z2.s[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ "fmla z10.s, z4.s, z2.s[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
+ "fmla z11.s, z4.s, z2.s[3]\n"
+ "fmla z20.s, z4.s, z3.s[0]\n"
+ "fmla z21.s, z4.s, z3.s[1]\n"
+ "fmla z22.s, z4.s, z3.s[2]\n"
+ "fmla z23.s, z4.s, z3.s[3]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ "fmla z12.s, z5.s, z2.s[0]\n"
+ "fmla z13.s, z5.s, z2.s[1]\n"
+ "fmla z14.s, z5.s, z2.s[2]\n"
+ "fmla z15.s, z5.s, z2.s[3]\n"
+ "fmla z24.s, z5.s, z3.s[0]\n"
+ "fmla z25.s, z5.s, z3.s[1]\n"
+ "fmla z26.s, z5.s, z3.s[2]\n"
+ "fmla z27.s, z5.s, z3.s[3]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "fmla z17.s, z6.s, z2.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z19.s, z6.s, z2.s[3]\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n"
+ "fmla z28.s, z6.s, z3.s[0]\n"
+ "fmla z29.s, z6.s, z3.s[1]\n"
+ "fmla z30.s, z6.s, z3.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[3]\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[tails], 3f\n"
+ "fmla z8.s, z4.s, z0.s[0]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "fmla z9.s, z4.s, z0.s[1]\n"
+ "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+ "fmla z10.s, z4.s, z0.s[2]\n"
+ "fmla z11.s, z4.s, z0.s[3]\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
+ "fmla z21.s, z4.s, z1.s[1]\n"
+ "fmla z22.s, z4.s, z1.s[2]\n"
+ "fmla z23.s, z4.s, z1.s[3]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+ "fmla z12.s, z5.s, z0.s[0]\n"
+ "fmla z13.s, z5.s, z0.s[1]\n"
+ "fmla z14.s, z5.s, z0.s[2]\n"
+ "fmla z15.s, z5.s, z0.s[3]\n"
+ "fmla z24.s, z5.s, z1.s[0]\n"
+ "fmla z25.s, z5.s, z1.s[1]\n"
+ "fmla z26.s, z5.s, z1.s[2]\n"
+ "fmla z27.s, z5.s, z1.s[3]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "fmla z16.s, z6.s, z0.s[0]\n"
+ "fmla z17.s, z6.s, z0.s[1]\n"
+ "fmla z18.s, z6.s, z0.s[2]\n"
+ "fmla z19.s, z6.s, z0.s[3]\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+ "fmla z28.s, z6.s, z1.s[0]\n"
+ "fmla z29.s, z6.s, z1.s[1]\n"
+ "fmla z30.s, z6.s, z1.s[2]\n"
+ "fmla z31.s, z6.s, z1.s[3]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "fmla z8.s, z4.s, z2.s[0]\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+ "fmla z9.s, z4.s, z2.s[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ "fmla z10.s, z4.s, z2.s[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
+ "fmla z11.s, z4.s, z2.s[3]\n"
+ "fmla z20.s, z4.s, z3.s[0]\n"
+ "fmla z21.s, z4.s, z3.s[1]\n"
+ "fmla z22.s, z4.s, z3.s[2]\n"
+ "fmla z23.s, z4.s, z3.s[3]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ "fmla z12.s, z5.s, z2.s[0]\n"
+ "fmla z13.s, z5.s, z2.s[1]\n"
+ "fmla z14.s, z5.s, z2.s[2]\n"
+ "fmla z15.s, z5.s, z2.s[3]\n"
+ "fmla z24.s, z5.s, z3.s[0]\n"
+ "fmla z25.s, z5.s, z3.s[1]\n"
+ "fmla z26.s, z5.s, z3.s[2]\n"
+ "fmla z27.s, z5.s, z3.s[3]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "fmla z17.s, z6.s, z2.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z19.s, z6.s, z2.s[3]\n"
+ "fmla z28.s, z6.s, z3.s[0]\n"
+ "fmla z29.s, z6.s, z3.s[1]\n"
+ "fmla z30.s, z6.s, z3.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[3]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "fmla z8.s, z4.s, z0.s[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
+ "fmla z9.s, z4.s, z0.s[1]\n"
+ "fmla z10.s, z4.s, z0.s[2]\n"
+ "fmla z11.s, z4.s, z0.s[3]\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
+ "fmla z21.s, z4.s, z1.s[1]\n"
+ "fmla z22.s, z4.s, z1.s[2]\n"
+ "fmla z23.s, z4.s, z1.s[3]\n"
+ "fmla z12.s, z5.s, z0.s[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "fmla z13.s, z5.s, z0.s[1]\n"
+ "fmla z14.s, z5.s, z0.s[2]\n"
+ "fmla z15.s, z5.s, z0.s[3]\n"
+ "fmla z24.s, z5.s, z1.s[0]\n"
+ "fmla z25.s, z5.s, z1.s[1]\n"
+ "fmla z26.s, z5.s, z1.s[2]\n"
+ "fmla z27.s, z5.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z0.s[0]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "fmla z17.s, z6.s, z0.s[1]\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "fmla z18.s, z6.s, z0.s[2]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "fmla z19.s, z6.s, z0.s[3]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "fmla z28.s, z6.s, z1.s[0]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "fmla z29.s, z6.s, z1.s[1]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "fmla z30.s, z6.s, z1.s[2]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
+ "fmla z31.s, z6.s, z1.s[3]\n"
+ "b 4f\n"
+ "3:\n"
+ "fmla z8.s, z4.s, z0.s[0]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "fmla z9.s, z4.s, z0.s[1]\n"
+ "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+ "fmla z10.s, z4.s, z0.s[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
+ "fmla z11.s, z4.s, z0.s[3]\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
+ "fmla z21.s, z4.s, z1.s[1]\n"
+ "fmla z22.s, z4.s, z1.s[2]\n"
+ "fmla z23.s, z4.s, z1.s[3]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ "fmla z12.s, z5.s, z0.s[0]\n"
+ "fmla z13.s, z5.s, z0.s[1]\n"
+ "fmla z14.s, z5.s, z0.s[2]\n"
+ "fmla z15.s, z5.s, z0.s[3]\n"
+ "fmla z24.s, z5.s, z1.s[0]\n"
+ "fmla z25.s, z5.s, z1.s[1]\n"
+ "fmla z26.s, z5.s, z1.s[2]\n"
+ "fmla z27.s, z5.s, z1.s[3]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ "fmla z16.s, z6.s, z0.s[0]\n"
+ "fmla z17.s, z6.s, z0.s[1]\n"
+ "fmla z18.s, z6.s, z0.s[2]\n"
+ "fmla z19.s, z6.s, z0.s[3]\n"
+ "fmla z28.s, z6.s, z1.s[0]\n"
+ "fmla z29.s, z6.s, z1.s[1]\n"
+ "fmla z30.s, z6.s, z1.s[2]\n"
+ "fmla z31.s, z6.s, z1.s[3]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "fmla z8.s, z4.s, z2.s[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
+ "fmla z9.s, z4.s, z2.s[1]\n"
+ "fmla z10.s, z4.s, z2.s[2]\n"
+ "fmla z11.s, z4.s, z2.s[3]\n"
+ "fmla z20.s, z4.s, z3.s[0]\n"
+ "fmla z21.s, z4.s, z3.s[1]\n"
+ "fmla z22.s, z4.s, z3.s[2]\n"
+ "fmla z23.s, z4.s, z3.s[3]\n"
+ "fmla z12.s, z5.s, z2.s[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "fmla z13.s, z5.s, z2.s[1]\n"
+ "fmla z14.s, z5.s, z2.s[2]\n"
+ "fmla z15.s, z5.s, z2.s[3]\n"
+ "fmla z24.s, z5.s, z3.s[0]\n"
+ "fmla z25.s, z5.s, z3.s[1]\n"
+ "fmla z26.s, z5.s, z3.s[2]\n"
+ "fmla z27.s, z5.s, z3.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "fmla z17.s, z6.s, z2.s[1]\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "fmla z19.s, z6.s, z2.s[3]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "fmla z28.s, z6.s, z3.s[0]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "fmla z29.s, z6.s, z3.s[1]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "fmla z30.s, z6.s, z3.s[2]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
+ "fmla z31.s, z6.s, z3.s[3]\n"
+ "4:\n"
+ "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
+ "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
+ "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
+ "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
+ "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
+ "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
+ "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
+ "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
+ "st1w z25.s, p0, [%[c_ptr]]\n"
+ "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #8\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [loops] "+r" (loops), [tails] "+r" (tails)
+ :
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
new file mode 100644
index 0000000..91aa567
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void sve_interleaved_s8s32_dot_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+class interleaved_s8s32_dot_3VLx8 {
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+ /* Kernel blocking parameters */
+ static int out_width()
+ {
+ return svcntw() * 3;
+ }
+
+ static int out_height()
+ {
+ return 8;
+ }
+
+ static int k_unroll()
+ {
+ return 4;
+ }
+
+ // Use the standard fixed size transforms.
+ StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+
+ kern_type kernel=sve_interleaved_s8s32_dot_3VLx8;
+
+ interleaved_s8s32_dot_3VLx8(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
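Note: the k_unroll() of 4 above comes from the SDOT instruction: each 32-bit accumulator lane gains the dot product of four signed bytes from each operand, so one K-step of this kernel consumes four values of K (the generic.cpp below divides K by 4 accordingly). A scalar model of one lane, illustrative only and not the library API:

    // Scalar model of one SDOT accumulator lane: widen four int8 pairs and
    // accumulate their dot product into an int32.
    #include <cstdint>

    static inline int32_t sdot_lane(int32_t acc, const int8_t a[4], const int8_t b[4]) {
        for (int i = 0; i < 4; ++i) {
            acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
        }
        return acc;
    }

    int main() {
        const int8_t a[4] = {1, -2, 3, -4};
        const int8_t b[4] = {5, 6, -7, 8};
        return sdot_lane(0, a, b) == -60 ? 0 : 1;  // 5 - 12 - 21 - 32
    }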
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
new file mode 100644
index 0000000..2e994a1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const int8_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+
+ K /= 4;
+ const long loops_count = (K / 2) - 1;
+ const long tails_count = K % 2;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const int8_t *a_ptr0 = a_ptr;
+ const int8_t *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ long loops = loops_count;
+ long tails = tails_count;
+
+ __asm __volatile (
+ "mov z8.s, #0\n"
+ "ptrue p0.b\n"
+ "mov z9.s, #0\n"
+ "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+ "mov z10.s, #0\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+ "mov z11.s, #0\n"
+ "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+ "mov z12.s, #0\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "mov z13.s, #0\n"
+ "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
+ "mov z14.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ "mov z15.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
+ "mov z16.s, #0\n"
+ "mov z17.s, #0\n"
+ "mov z18.s, #0\n"
+ "mov z19.s, #0\n"
+ "mov z20.s, #0\n"
+ "mov z21.s, #0\n"
+ "mov z22.s, #0\n"
+ "mov z23.s, #0\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "sdot z8.s, z4.b, z0.b[0]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "sdot z9.s, z4.b, z0.b[1]\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+ "sdot z10.s, z4.b, z0.b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z11.s, z4.b, z0.b[3]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "sdot z21.s, z4.b, z1.b[1]\n"
+ "sdot z22.s, z4.b, z1.b[2]\n"
+ "sdot z23.s, z4.b, z1.b[3]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+ "sdot z12.s, z5.b, z0.b[0]\n"
+ "sdot z13.s, z5.b, z0.b[1]\n"
+ "sdot z14.s, z5.b, z0.b[2]\n"
+ "sdot z15.s, z5.b, z0.b[3]\n"
+ "sdot z24.s, z5.b, z1.b[0]\n"
+ "sdot z25.s, z5.b, z1.b[1]\n"
+ "sdot z26.s, z5.b, z1.b[2]\n"
+ "sdot z27.s, z5.b, z1.b[3]\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "sdot z17.s, z6.b, z0.b[1]\n"
+ "sdot z18.s, z6.b, z0.b[2]\n"
+ "sdot z19.s, z6.b, z0.b[3]\n"
+ "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+ "sdot z28.s, z6.b, z1.b[0]\n"
+ "sdot z29.s, z6.b, z1.b[1]\n"
+ "sdot z30.s, z6.b, z1.b[2]\n"
+ "sdot z31.s, z6.b, z1.b[3]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "sdot z8.s, z4.b, z2.b[0]\n"
+ "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+ "sdot z9.s, z4.b, z2.b[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ "sdot z10.s, z4.b, z2.b[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
+ "sdot z11.s, z4.b, z2.b[3]\n"
+ "sdot z20.s, z4.b, z3.b[0]\n"
+ "sdot z21.s, z4.b, z3.b[1]\n"
+ "sdot z22.s, z4.b, z3.b[2]\n"
+ "sdot z23.s, z4.b, z3.b[3]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ "sdot z12.s, z5.b, z2.b[0]\n"
+ "sdot z13.s, z5.b, z2.b[1]\n"
+ "sdot z14.s, z5.b, z2.b[2]\n"
+ "sdot z15.s, z5.b, z2.b[3]\n"
+ "sdot z24.s, z5.b, z3.b[0]\n"
+ "sdot z25.s, z5.b, z3.b[1]\n"
+ "sdot z26.s, z5.b, z3.b[2]\n"
+ "sdot z27.s, z5.b, z3.b[3]\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "sdot z17.s, z6.b, z2.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z19.s, z6.b, z2.b[3]\n"
+ "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
+ "sdot z28.s, z6.b, z3.b[0]\n"
+ "sdot z29.s, z6.b, z3.b[1]\n"
+ "sdot z30.s, z6.b, z3.b[2]\n"
+ "sdot z31.s, z6.b, z3.b[3]\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[tails], 3f\n"
+ "sdot z8.s, z4.b, z0.b[0]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "sdot z9.s, z4.b, z0.b[1]\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+ "sdot z10.s, z4.b, z0.b[2]\n"
+ "sdot z11.s, z4.b, z0.b[3]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "sdot z21.s, z4.b, z1.b[1]\n"
+ "sdot z22.s, z4.b, z1.b[2]\n"
+ "sdot z23.s, z4.b, z1.b[3]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+ "sdot z12.s, z5.b, z0.b[0]\n"
+ "sdot z13.s, z5.b, z0.b[1]\n"
+ "sdot z14.s, z5.b, z0.b[2]\n"
+ "sdot z15.s, z5.b, z0.b[3]\n"
+ "sdot z24.s, z5.b, z1.b[0]\n"
+ "sdot z25.s, z5.b, z1.b[1]\n"
+ "sdot z26.s, z5.b, z1.b[2]\n"
+ "sdot z27.s, z5.b, z1.b[3]\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "sdot z17.s, z6.b, z0.b[1]\n"
+ "sdot z18.s, z6.b, z0.b[2]\n"
+ "sdot z19.s, z6.b, z0.b[3]\n"
+ "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+ "sdot z28.s, z6.b, z1.b[0]\n"
+ "sdot z29.s, z6.b, z1.b[1]\n"
+ "sdot z30.s, z6.b, z1.b[2]\n"
+ "sdot z31.s, z6.b, z1.b[3]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "sdot z8.s, z4.b, z2.b[0]\n"
+ "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+ "sdot z9.s, z4.b, z2.b[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ "sdot z10.s, z4.b, z2.b[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
+ "sdot z11.s, z4.b, z2.b[3]\n"
+ "sdot z20.s, z4.b, z3.b[0]\n"
+ "sdot z21.s, z4.b, z3.b[1]\n"
+ "sdot z22.s, z4.b, z3.b[2]\n"
+ "sdot z23.s, z4.b, z3.b[3]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ "sdot z12.s, z5.b, z2.b[0]\n"
+ "sdot z13.s, z5.b, z2.b[1]\n"
+ "sdot z14.s, z5.b, z2.b[2]\n"
+ "sdot z15.s, z5.b, z2.b[3]\n"
+ "sdot z24.s, z5.b, z3.b[0]\n"
+ "sdot z25.s, z5.b, z3.b[1]\n"
+ "sdot z26.s, z5.b, z3.b[2]\n"
+ "sdot z27.s, z5.b, z3.b[3]\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "sdot z17.s, z6.b, z2.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z19.s, z6.b, z2.b[3]\n"
+ "sdot z28.s, z6.b, z3.b[0]\n"
+ "sdot z29.s, z6.b, z3.b[1]\n"
+ "sdot z30.s, z6.b, z3.b[2]\n"
+ "sdot z31.s, z6.b, z3.b[3]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "sdot z8.s, z4.b, z0.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
+ "sdot z9.s, z4.b, z0.b[1]\n"
+ "sdot z10.s, z4.b, z0.b[2]\n"
+ "sdot z11.s, z4.b, z0.b[3]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "sdot z21.s, z4.b, z1.b[1]\n"
+ "sdot z22.s, z4.b, z1.b[2]\n"
+ "sdot z23.s, z4.b, z1.b[3]\n"
+ "sdot z12.s, z5.b, z0.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "sdot z13.s, z5.b, z0.b[1]\n"
+ "sdot z14.s, z5.b, z0.b[2]\n"
+ "sdot z15.s, z5.b, z0.b[3]\n"
+ "sdot z24.s, z5.b, z1.b[0]\n"
+ "sdot z25.s, z5.b, z1.b[1]\n"
+ "sdot z26.s, z5.b, z1.b[2]\n"
+ "sdot z27.s, z5.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "sdot z17.s, z6.b, z0.b[1]\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[2]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "sdot z19.s, z6.b, z0.b[3]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "sdot z28.s, z6.b, z1.b[0]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "sdot z29.s, z6.b, z1.b[1]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "sdot z30.s, z6.b, z1.b[2]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
+ "sdot z31.s, z6.b, z1.b[3]\n"
+ "b 4f\n"
+ "3:\n"
+ "sdot z8.s, z4.b, z0.b[0]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "sdot z9.s, z4.b, z0.b[1]\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+ "sdot z10.s, z4.b, z0.b[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
+ "sdot z11.s, z4.b, z0.b[3]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "sdot z21.s, z4.b, z1.b[1]\n"
+ "sdot z22.s, z4.b, z1.b[2]\n"
+ "sdot z23.s, z4.b, z1.b[3]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ "sdot z12.s, z5.b, z0.b[0]\n"
+ "sdot z13.s, z5.b, z0.b[1]\n"
+ "sdot z14.s, z5.b, z0.b[2]\n"
+ "sdot z15.s, z5.b, z0.b[3]\n"
+ "sdot z24.s, z5.b, z1.b[0]\n"
+ "sdot z25.s, z5.b, z1.b[1]\n"
+ "sdot z26.s, z5.b, z1.b[2]\n"
+ "sdot z27.s, z5.b, z1.b[3]\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "sdot z17.s, z6.b, z0.b[1]\n"
+ "sdot z18.s, z6.b, z0.b[2]\n"
+ "sdot z19.s, z6.b, z0.b[3]\n"
+ "sdot z28.s, z6.b, z1.b[0]\n"
+ "sdot z29.s, z6.b, z1.b[1]\n"
+ "sdot z30.s, z6.b, z1.b[2]\n"
+ "sdot z31.s, z6.b, z1.b[3]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "sdot z8.s, z4.b, z2.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
+ "sdot z9.s, z4.b, z2.b[1]\n"
+ "sdot z10.s, z4.b, z2.b[2]\n"
+ "sdot z11.s, z4.b, z2.b[3]\n"
+ "sdot z20.s, z4.b, z3.b[0]\n"
+ "sdot z21.s, z4.b, z3.b[1]\n"
+ "sdot z22.s, z4.b, z3.b[2]\n"
+ "sdot z23.s, z4.b, z3.b[3]\n"
+ "sdot z12.s, z5.b, z2.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "sdot z13.s, z5.b, z2.b[1]\n"
+ "sdot z14.s, z5.b, z2.b[2]\n"
+ "sdot z15.s, z5.b, z2.b[3]\n"
+ "sdot z24.s, z5.b, z3.b[0]\n"
+ "sdot z25.s, z5.b, z3.b[1]\n"
+ "sdot z26.s, z5.b, z3.b[2]\n"
+ "sdot z27.s, z5.b, z3.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "sdot z17.s, z6.b, z2.b[1]\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "sdot z19.s, z6.b, z2.b[3]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "sdot z28.s, z6.b, z3.b[0]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "sdot z29.s, z6.b, z3.b[1]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "sdot z30.s, z6.b, z3.b[2]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
+ "sdot z31.s, z6.b, z3.b[3]\n"
+ "4:\n"
+ "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
+ "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
+ "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
+ "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
+ "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
+ "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
+ "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
+ "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
+ "st1w z25.s, p0, [%[c_ptr]]\n"
+ "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #8\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [loops] "+r" (loops), [tails] "+r" (tails)
+ :
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
new file mode 100644
index 0000000..ef457e4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+class interleaved_u8u32_dot_3VLx8 {
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+ /* Kernel blocking parameters */
+ static int out_width()
+ {
+ return svcntw() * 3;
+ }
+
+ static int out_height()
+ {
+ return 8;
+ }
+
+ static int k_unroll()
+ {
+ return 4;
+ }
+
+ // Use the standard fixed size transforms.
+ StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+
+ kern_type kernel=sve_interleaved_u8u32_dot_3VLx8;
+
+ interleaved_u8u32_dot_3VLx8(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
new file mode 100644
index 0000000..f4d33a9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+
+ K /= 4;
+ const long loops_count = (K / 2) - 1;
+ const long tails_count = K % 2;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ long loops = loops_count;
+ long tails = tails_count;
+
+ __asm __volatile (
+ "mov z8.s, #0\n"
+ "ptrue p0.b\n"
+ "mov z9.s, #0\n"
+ "mov z10.s, #0\n"
+ "mov z11.s, #0\n"
+ "mov z12.s, #0\n"
+ "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+ "mov z13.s, #0\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+ "mov z14.s, #0\n"
+ "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+ "mov z15.s, #0\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
+ "mov z17.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ "mov z18.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
+ "mov z19.s, #0\n"
+ "mov z20.s, #0\n"
+ "mov z21.s, #0\n"
+ "mov z22.s, #0\n"
+ "mov z23.s, #0\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "udot z8.s, z4.b, z0.b[0]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "udot z9.s, z4.b, z0.b[1]\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+ "udot z10.s, z4.b, z0.b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z11.s, z4.b, z0.b[3]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "udot z21.s, z4.b, z1.b[1]\n"
+ "udot z22.s, z4.b, z1.b[2]\n"
+ "udot z23.s, z4.b, z1.b[3]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+ "udot z12.s, z5.b, z0.b[0]\n"
+ "udot z13.s, z5.b, z0.b[1]\n"
+ "udot z14.s, z5.b, z0.b[2]\n"
+ "udot z15.s, z5.b, z0.b[3]\n"
+ "udot z24.s, z5.b, z1.b[0]\n"
+ "udot z25.s, z5.b, z1.b[1]\n"
+ "udot z26.s, z5.b, z1.b[2]\n"
+ "udot z27.s, z5.b, z1.b[3]\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "udot z17.s, z6.b, z0.b[1]\n"
+ "udot z18.s, z6.b, z0.b[2]\n"
+ "udot z19.s, z6.b, z0.b[3]\n"
+ "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+ "udot z28.s, z6.b, z1.b[0]\n"
+ "udot z29.s, z6.b, z1.b[1]\n"
+ "udot z30.s, z6.b, z1.b[2]\n"
+ "udot z31.s, z6.b, z1.b[3]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "udot z8.s, z4.b, z2.b[0]\n"
+ "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+ "udot z9.s, z4.b, z2.b[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ "udot z10.s, z4.b, z2.b[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
+ "udot z11.s, z4.b, z2.b[3]\n"
+ "udot z20.s, z4.b, z3.b[0]\n"
+ "udot z21.s, z4.b, z3.b[1]\n"
+ "udot z22.s, z4.b, z3.b[2]\n"
+ "udot z23.s, z4.b, z3.b[3]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ "udot z12.s, z5.b, z2.b[0]\n"
+ "udot z13.s, z5.b, z2.b[1]\n"
+ "udot z14.s, z5.b, z2.b[2]\n"
+ "udot z15.s, z5.b, z2.b[3]\n"
+ "udot z24.s, z5.b, z3.b[0]\n"
+ "udot z25.s, z5.b, z3.b[1]\n"
+ "udot z26.s, z5.b, z3.b[2]\n"
+ "udot z27.s, z5.b, z3.b[3]\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "udot z17.s, z6.b, z2.b[1]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z19.s, z6.b, z2.b[3]\n"
+ "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
+ "udot z28.s, z6.b, z3.b[0]\n"
+ "udot z29.s, z6.b, z3.b[1]\n"
+ "udot z30.s, z6.b, z3.b[2]\n"
+ "udot z31.s, z6.b, z3.b[3]\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[tails], 3f\n"
+ "udot z8.s, z4.b, z0.b[0]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "udot z9.s, z4.b, z0.b[1]\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+ "udot z10.s, z4.b, z0.b[2]\n"
+ "udot z11.s, z4.b, z0.b[3]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "udot z21.s, z4.b, z1.b[1]\n"
+ "udot z22.s, z4.b, z1.b[2]\n"
+ "udot z23.s, z4.b, z1.b[3]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+ "udot z12.s, z5.b, z0.b[0]\n"
+ "udot z13.s, z5.b, z0.b[1]\n"
+ "udot z14.s, z5.b, z0.b[2]\n"
+ "udot z15.s, z5.b, z0.b[3]\n"
+ "udot z24.s, z5.b, z1.b[0]\n"
+ "udot z25.s, z5.b, z1.b[1]\n"
+ "udot z26.s, z5.b, z1.b[2]\n"
+ "udot z27.s, z5.b, z1.b[3]\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "udot z17.s, z6.b, z0.b[1]\n"
+ "udot z18.s, z6.b, z0.b[2]\n"
+ "udot z19.s, z6.b, z0.b[3]\n"
+ "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+ "udot z28.s, z6.b, z1.b[0]\n"
+ "udot z29.s, z6.b, z1.b[1]\n"
+ "udot z30.s, z6.b, z1.b[2]\n"
+ "udot z31.s, z6.b, z1.b[3]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "udot z8.s, z4.b, z2.b[0]\n"
+ "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+ "udot z9.s, z4.b, z2.b[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ "udot z10.s, z4.b, z2.b[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
+ "udot z11.s, z4.b, z2.b[3]\n"
+ "udot z20.s, z4.b, z3.b[0]\n"
+ "udot z21.s, z4.b, z3.b[1]\n"
+ "udot z22.s, z4.b, z3.b[2]\n"
+ "udot z23.s, z4.b, z3.b[3]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ "udot z12.s, z5.b, z2.b[0]\n"
+ "udot z13.s, z5.b, z2.b[1]\n"
+ "udot z14.s, z5.b, z2.b[2]\n"
+ "udot z15.s, z5.b, z2.b[3]\n"
+ "udot z24.s, z5.b, z3.b[0]\n"
+ "udot z25.s, z5.b, z3.b[1]\n"
+ "udot z26.s, z5.b, z3.b[2]\n"
+ "udot z27.s, z5.b, z3.b[3]\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "udot z17.s, z6.b, z2.b[1]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z19.s, z6.b, z2.b[3]\n"
+ "udot z28.s, z6.b, z3.b[0]\n"
+ "udot z29.s, z6.b, z3.b[1]\n"
+ "udot z30.s, z6.b, z3.b[2]\n"
+ "udot z31.s, z6.b, z3.b[3]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "udot z8.s, z4.b, z0.b[0]\n"
+ "udot z9.s, z4.b, z0.b[1]\n"
+ "udot z10.s, z4.b, z0.b[2]\n"
+ "udot z11.s, z4.b, z0.b[3]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
+ "udot z21.s, z4.b, z1.b[1]\n"
+ "udot z22.s, z4.b, z1.b[2]\n"
+ "udot z23.s, z4.b, z1.b[3]\n"
+ "udot z12.s, z5.b, z0.b[0]\n"
+ "udot z13.s, z5.b, z0.b[1]\n"
+ "udot z14.s, z5.b, z0.b[2]\n"
+ "udot z15.s, z5.b, z0.b[3]\n"
+ "udot z24.s, z5.b, z1.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "udot z25.s, z5.b, z1.b[1]\n"
+ "udot z26.s, z5.b, z1.b[2]\n"
+ "udot z27.s, z5.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "udot z17.s, z6.b, z0.b[1]\n"
+ "udot z18.s, z6.b, z0.b[2]\n"
+ "udot z19.s, z6.b, z0.b[3]\n"
+ "udot z28.s, z6.b, z1.b[0]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "udot z29.s, z6.b, z1.b[1]\n"
+ "udot z30.s, z6.b, z1.b[2]\n"
+ "udot z31.s, z6.b, z1.b[3]\n"
+ "b 4f\n"
+ "3:\n"
+ "udot z8.s, z4.b, z0.b[0]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "udot z9.s, z4.b, z0.b[1]\n"
+ "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+ "udot z10.s, z4.b, z0.b[2]\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
+ "udot z11.s, z4.b, z0.b[3]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "udot z21.s, z4.b, z1.b[1]\n"
+ "udot z22.s, z4.b, z1.b[2]\n"
+ "udot z23.s, z4.b, z1.b[3]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+ "udot z12.s, z5.b, z0.b[0]\n"
+ "udot z13.s, z5.b, z0.b[1]\n"
+ "udot z14.s, z5.b, z0.b[2]\n"
+ "udot z15.s, z5.b, z0.b[3]\n"
+ "udot z24.s, z5.b, z1.b[0]\n"
+ "udot z25.s, z5.b, z1.b[1]\n"
+ "udot z26.s, z5.b, z1.b[2]\n"
+ "udot z27.s, z5.b, z1.b[3]\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "udot z17.s, z6.b, z0.b[1]\n"
+ "udot z18.s, z6.b, z0.b[2]\n"
+ "udot z19.s, z6.b, z0.b[3]\n"
+ "udot z28.s, z6.b, z1.b[0]\n"
+ "udot z29.s, z6.b, z1.b[1]\n"
+ "udot z30.s, z6.b, z1.b[2]\n"
+ "udot z31.s, z6.b, z1.b[3]\n"
+ "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+ "udot z8.s, z4.b, z2.b[0]\n"
+ "udot z9.s, z4.b, z2.b[1]\n"
+ "udot z10.s, z4.b, z2.b[2]\n"
+ "udot z11.s, z4.b, z2.b[3]\n"
+ "udot z20.s, z4.b, z3.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
+ "udot z21.s, z4.b, z3.b[1]\n"
+ "udot z22.s, z4.b, z3.b[2]\n"
+ "udot z23.s, z4.b, z3.b[3]\n"
+ "udot z12.s, z5.b, z2.b[0]\n"
+ "udot z13.s, z5.b, z2.b[1]\n"
+ "udot z14.s, z5.b, z2.b[2]\n"
+ "udot z15.s, z5.b, z2.b[3]\n"
+ "udot z24.s, z5.b, z3.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "udot z25.s, z5.b, z3.b[1]\n"
+ "udot z26.s, z5.b, z3.b[2]\n"
+ "udot z27.s, z5.b, z3.b[3]\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "udot z17.s, z6.b, z2.b[1]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z19.s, z6.b, z2.b[3]\n"
+ "udot z28.s, z6.b, z3.b[0]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "udot z29.s, z6.b, z3.b[1]\n"
+ "udot z30.s, z6.b, z3.b[2]\n"
+ "udot z31.s, z6.b, z3.b[3]\n"
+ "4:\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
+ "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
+ "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
+ "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
+ "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
+ "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
+ "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
+ "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
+ "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
+ "st1w z25.s, p0, [%[c_ptr]]\n"
+ "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
+ "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
+ "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #8\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [loops] "+r" (loops), [tails] "+r" (tails)
+ :
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
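Note on the dot-product form used throughout the SVE kernel above: each indexed udot adds an unsigned 4-element dot product into every 32-bit accumulator lane of its destination. The scalar model below is an illustrative sketch only (a hypothetical helper, not code from this patch); it shows what one 128-bit segment of "udot zd.s, zn.b, zm.b[idx]" computes.

    #include <cstddef>
    #include <cstdint>

    // Reference for one 128-bit segment of an indexed UDOT: every 32-bit
    // accumulator lane gains the dot product of its four bytes of 'a' with
    // the four bytes of the group of 'b' selected by 'idx' (0..3).
    static void udot_segment_reference(uint32_t acc[4], const uint8_t a[16],
                                       const uint8_t b[16], std::size_t idx)
    {
        for (std::size_t lane = 0; lane < 4; ++lane)
        {
            uint32_t sum = 0;
            for (std::size_t k = 0; k < 4; ++k)
            {
                sum += static_cast<uint32_t>(a[4 * lane + k]) *
                       static_cast<uint32_t>(b[4 * idx + k]);
            }
            acc[lane] += sum;
        }
    }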
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
index ee32ce7..35d4cc5 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
@@ -273,7 +273,7 @@
template<>
inline void MergeResults<12, 8>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t alpha, const uint32_t beta) {
// Since the above code uses only MUL and MLA instructions, the "unsignedness" can safely be discarded here.
- MergeResults<12, 8>(reinterpret_cast<int32_t*>(out), reinterpret_cast<const int32_t*>(in), ldout, y0, ymax, x0, xmax, static_cast<const int32_t>(alpha), static_cast<const int32_t>(beta));
+ MergeResults<12, 8>(reinterpret_cast<int32_t*>(out), reinterpret_cast<const int32_t*>(in), ldout, y0, ymax, x0, xmax, static_cast<int32_t>(alpha), static_cast<int32_t>(beta));
}
#endif // __aarch64__
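The one-line fix above drops the redundant const from the cast target types: a const qualifier on a scalar cast result has no effect, and some compilers flag it (for example via -Wignored-qualifiers). A minimal illustration, using a hypothetical helper that is not part of the patch:

    #include <cstdint>

    // 'const' on the target of a value cast is discarded, so the two casts
    // below produce identical code; the first form avoids the warning.
    int32_t as_signed(uint32_t v)
    {
        return static_cast<int32_t>(v);
        // return static_cast<const int32_t>(v); // qualifier is ignored; may warn
    }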
diff --git a/src/core/NEON/kernels/arm_gemm/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
index d93f1b0..181d1a4 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
@@ -26,3 +26,5 @@
#include "a64_merge_float_to_half_12x8.hpp"
#include "a64_merge_half_24x8.hpp"
#include "a64_merge_int32_12x8.hpp"
+#include "sve_merge_fp32_2VLx8.hpp"
+#include "sve_merge_fp32_3VLx8.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_2VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_2VLx8.hpp
new file mode 100644
index 0000000..7479c8d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_2VLx8.hpp
@@ -0,0 +1,1208 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+template<>
+inline void MergeResults<2, 8, true>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+ const float *inptr = in;
+
+ for (int y=y0; y<ymax; y+=8) {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+ float *outptr6 = outptr5 + ldout;
+ float *outptr7 = outptr6 + ldout;
+
+ const int height = ymax - y;
+
+ for (int i=x0; i<xmax; i+=(2 * get_vector_length<float>())) {
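+            // Each pass of this loop produces a strip of up to 2 * VL output columns.
+            // The "whilelt p0.s, %[p], %[w]" / "b.none 1f" pairs in the assembly below
+            // build a lane predicate from the remaining width, so a partial strip at
+            // the right-hand edge is handled by predication instead of a scalar tail.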
+ if (beta==0.0f)
+ {
+ switch(height) {
+ case 1:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 2:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 3:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 4:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x40]\n"
+ "addvl %[outptr3], %[outptr3], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 5:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8, #-7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x40]\n"
+ "addvl %[outptr3], %[outptr3], #2\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x40]\n"
+ "addvl %[outptr4], %[outptr4], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 6:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #-7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x40]\n"
+ "addvl %[outptr3], %[outptr3], #2\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x40]\n"
+ "addvl %[outptr4], %[outptr4], #2\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x40]\n"
+ "addvl %[outptr5], %[outptr5], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 7:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "ld1w z6.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8, #-5, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8, #-3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x40]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr6], #1, MUL VL]\n"
+ "addvl %[outptr3], %[outptr3], #2\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x40]\n"
+ "addvl %[outptr4], %[outptr4], #2\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x40]\n"
+ "addvl %[outptr5], %[outptr5], #2\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x40]\n"
+ "addvl %[outptr6], %[outptr6], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ default:
+ case 8:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "ld1w z6.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6]]\n"
+ "ld1w z7.s, p0/z, [x8, #-2, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr7]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8, #-7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #-3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x40]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #-1, MUL VL]\n"
+ "addvl %[outptr3], %[outptr3], #2\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr7], #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x40]\n"
+ "addvl %[outptr4], %[outptr4], #2\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x40]\n"
+ "addvl %[outptr5], %[outptr5], #2\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x40]\n"
+ "addvl %[outptr6], %[outptr6], #2\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x40]\n"
+ "addvl %[outptr7], %[outptr7], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+
+ }
+ }
+ else
+ {
+ switch(height) {
+ case 1:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z9.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 2:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z10.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 3:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z11.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 4:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z11.s, p0/z, [%[outptr3]]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x40]\n"
+ "addvl %[outptr3], %[outptr3], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 5:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z11.s, p0/z, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z8.s, p0/z, [%[outptr4]]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z9.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x40]\n"
+ "addvl %[outptr3], %[outptr3], #2\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x40]\n"
+ "addvl %[outptr4], %[outptr4], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 6:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z11.s, p0/z, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z8.s, p0/z, [%[outptr4]]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z9.s, p0/z, [%[outptr5]]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z10.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x40]\n"
+ "addvl %[outptr3], %[outptr3], #2\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x40]\n"
+ "addvl %[outptr4], %[outptr4], #2\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x40]\n"
+ "addvl %[outptr5], %[outptr5], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 7:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z11.s, p0/z, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z8.s, p0/z, [%[outptr4]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z9.s, p0/z, [%[outptr5]]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "ld1w z10.s, p0/z, [%[outptr6]]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z11.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-5, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr6], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x40]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr6], #1, MUL VL]\n"
+ "addvl %[outptr3], %[outptr3], #2\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x40]\n"
+ "addvl %[outptr4], %[outptr4], #2\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x40]\n"
+ "addvl %[outptr5], %[outptr5], #2\n"
+ "prfm PLDL1KEEP, [%[outptr6], #0x40]\n"
+ "addvl %[outptr6], %[outptr6], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ default:
+ case 8:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z11.s, p0/z, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z8.s, p0/z, [%[outptr4]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z9.s, p0/z, [%[outptr5]]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "ld1w z10.s, p0/z, [%[outptr6]]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6]]\n"
+ "ld1w z11.s, p0/z, [%[outptr7]]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-2, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr7]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #2\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #2\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #2\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr6], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x40]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-3, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr7], #1, MUL VL]\n"
+ "addvl %[outptr3], %[outptr3], #2\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-1, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr7], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x40]\n"
+ "addvl %[outptr4], %[outptr4], #2\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x40]\n"
+ "addvl %[outptr5], %[outptr5], #2\n"
+ "prfm PLDL1KEEP, [%[outptr6], #0x40]\n"
+ "addvl %[outptr6], %[outptr6], #2\n"
+ "prfm PLDL1KEEP, [%[outptr7], #0x40]\n"
+ "addvl %[outptr7], %[outptr7], #2\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #16\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+
+ }
+ }
+ }
+ }
+}
+
+#endif // __ARM_FEATURE_SVE
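Both SVE merge specializations added here (the 2VLx8 file above and the 3VLx8 file below) compute the same arithmetic; the assembly cases only vary the number of live output rows and whether the previous output is read. The scalar model below is an illustrative sketch, not the library's fallback, and assumes the accumulator tile is stored row-major with a fixed row stride (2 * VL floats for the file above, 3 * VL for the one below), which matches how %[inptr] is indexed.

    // out <- alpha * acc + beta * out for one rows x cols tile; when beta is
    // zero the kernels above never read the previous contents of 'out'.
    static void merge_block_reference(float *out, const float *acc, int ldout,
                                      int acc_stride, int rows, int cols,
                                      float alpha, float beta)
    {
        for (int r = 0; r < rows; ++r)
        {
            for (int c = 0; c < cols; ++c)
            {
                const float v = alpha * acc[r * acc_stride + c];
                out[r * ldout + c] = (beta == 0.0f) ? v
                                                    : v + beta * out[r * ldout + c];
            }
        }
    }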
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
new file mode 100644
index 0000000..27084c3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
@@ -0,0 +1,1564 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+template<>
+inline void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+ const float *inptr = in;
+
+ for (int y=y0; y<ymax; y+=8) {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+ float *outptr6 = outptr5 + ldout;
+ float *outptr7 = outptr6 + ldout;
+
+ const int height = ymax - y;
+
+ for (int i=x0; i<xmax; i+=(3 * get_vector_length<float>())) {
+ if (beta==0.0f)
+ {
+ switch(height) {
+ case 1:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 2:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 3:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 4:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "addvl %[outptr3], %[outptr3], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 5:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8, #-3, MUL VL]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "addvl %[outptr3], %[outptr3], #3\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "addvl %[outptr4], %[outptr4], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 6:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #-3, MUL VL]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "addvl %[outptr3], %[outptr3], #3\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "addvl %[outptr4], %[outptr4], #3\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "addvl %[outptr5], %[outptr5], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 7:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #-3, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8, #3, MUL VL]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr6], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr5], #2, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8, #4, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr6], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "addvl %[outptr3], %[outptr3], #3\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "addvl %[outptr4], %[outptr4], #3\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "addvl %[outptr5], %[outptr5], #3\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "addvl %[outptr6], %[outptr6], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ default:
+ case 8:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6]]\n"
+ "ld1w z7.s, p0/z, [x8, #5, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr7]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8, #-3, MUL VL]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #3, MUL VL]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6], #1, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #6, MUL VL]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr7], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+ "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul z8.s, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
+ "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul z9.s, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
+ "ld1w z6.s, p0/z, [x8, #4, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmul z10.s, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6], #2, MUL VL]\n"
+ "ld1w z7.s, p0/z, [x8, #7, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul z11.s, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr7], #2, MUL VL]\n"
+ "addvl %[outptr3], %[outptr3], #3\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "addvl %[outptr4], %[outptr4], #3\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "addvl %[outptr5], %[outptr5], #3\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "addvl %[outptr6], %[outptr6], #3\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "addvl %[outptr7], %[outptr7], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+
+ }
+ }
+ else
+ {
+ switch(height) {
+ case 1:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z9.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 2:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z10.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 3:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z11.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 4:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z11.s, p0/z, [%[outptr3]]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr3], #2, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "addvl %[outptr3], %[outptr3], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 5:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z11.s, p0/z, [%[outptr3]]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z8.s, p0/z, [%[outptr4]]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z9.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr3], #2, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr4], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "addvl %[outptr3], %[outptr3], #3\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "addvl %[outptr4], %[outptr4], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 6:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z11.s, p0/z, [%[outptr3]]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z8.s, p0/z, [%[outptr4]]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z9.s, p0/z, [%[outptr5]]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z10.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-3, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr3], #2, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr4], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr5], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "addvl %[outptr3], %[outptr3], #3\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "addvl %[outptr4], %[outptr4], #3\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+ "addvl %[outptr5], %[outptr5], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ case 7:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z11.s, p0/z, [%[outptr3]]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z8.s, p0/z, [%[outptr4]]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z9.s, p0/z, [%[outptr5]]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "ld1w z10.s, p0/z, [%[outptr6]]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z11.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-3, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr6], #1, MUL VL]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr6], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr3], #2, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr4], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr5], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #1, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr5], #2, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr6], #2, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #4, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr6], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "addvl %[outptr3], %[outptr3], #3\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "addvl %[outptr4], %[outptr4], #3\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+ "addvl %[outptr5], %[outptr5], #3\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
+ "addvl %[outptr6], %[outptr6], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+ default:
+ case 8:
+ {
+ long w = xmax - i;
+ long p = 0;
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "mov z2.s, %s[alpha]\n"
+ "addvl x8, %[inptr], #16\n"
+ "mov z3.s, %s[beta]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0]]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr]]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0]]\n"
+ "ld1w z9.s, p0/z, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1]]\n"
+ "ld1w z10.s, p0/z, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2]]\n"
+ "ld1w z11.s, p0/z, [%[outptr3]]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3]]\n"
+ "ld1w z8.s, p0/z, [%[outptr4]]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4]]\n"
+ "ld1w z9.s, p0/z, [%[outptr5]]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5]]\n"
+ "ld1w z10.s, p0/z, [%[outptr6]]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6]]\n"
+ "ld1w z11.s, p0/z, [%[outptr7]]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #5, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr7]]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+ "incw %[p], all, mul #1\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-3, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4], #1, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5], #1, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr6], #1, MUL VL]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #3, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6], #1, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr7], #1, MUL VL]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #6, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr7], #1, MUL VL]\n"
+ "whilelt p0.s, %[p], %[w]\n"
+ "b.none 1f\n"
+ "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+ "addvl %[outptr0], %[outptr0], #3\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr3], #2, MUL VL]\n"
+ "addvl %[outptr1], %[outptr1], #3\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+ "ld1w z8.s, p0/z, [%[outptr4], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul z8.s, z8.s, z3.s\n"
+ "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
+ "fmla z8.s, p0/m, z4.s, z2.s\n"
+ "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
+ "ld1w z9.s, p0/z, [%[outptr5], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul z9.s, z9.s, z3.s\n"
+ "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
+ "fmla z9.s, p0/m, z5.s, z2.s\n"
+ "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
+ "ld1w z10.s, p0/z, [%[outptr6], #2, MUL VL]\n"
+ "addvl %[outptr2], %[outptr2], #3\n"
+ "fmul z10.s, z10.s, z3.s\n"
+ "ld1w z6.s, p0/z, [x8, #4, MUL VL]\n"
+ "fmla z10.s, p0/m, z6.s, z2.s\n"
+ "st1w z10.s, p0, [%[outptr6], #2, MUL VL]\n"
+ "ld1w z11.s, p0/z, [%[outptr7], #2, MUL VL]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul z11.s, z11.s, z3.s\n"
+ "ld1w z7.s, p0/z, [x8, #7, MUL VL]\n"
+ "fmla z11.s, p0/m, z7.s, z2.s\n"
+ "st1w z11.s, p0, [%[outptr7], #2, MUL VL]\n"
+ "addvl %[outptr3], %[outptr3], #3\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "addvl %[outptr4], %[outptr4], #3\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+ "addvl %[outptr5], %[outptr5], #3\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
+ "addvl %[outptr6], %[outptr6], #3\n"
+ "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
+ "addvl %[outptr7], %[outptr7], #3\n"
+ "1:\n"
+ "addvl %[inptr], %[inptr], #24\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr), [p] "+r" (p)
+ : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+ : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+ );
+ }
+ break;
+
+
+ }
+ }
+ }
+ }
+}
+
+#endif // __ARM_FEATURE_SVE
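Despite the generated "Optimized routine to copy an entire block" comments, each predicated block in the merge above performs the GEMM alpha/beta update: the first switch simply stores alpha * input, while the second reloads the existing output, scales it by beta and accumulates alpha * input, working on up to eight rows and three vector registers per row (whilelt/incw predicate away the ragged right-hand edge, and the input pointer advances by a fixed 24 vector lengths per block). A minimal scalar sketch of that arithmetic follows; the function name, the row-major output with leading dimension ldout, the dense input tile, and the beta == 0 test standing in for the branch above this hunk are all illustrative assumptions, not code from the patch.

#include <cstddef>

// Hypothetical scalar reference for the arithmetic of the SVE merge blocks
// above (layout details are simplified: the real kernel keeps a fixed stride
// of three vector registers per row even at the ragged edge).
static void sve_merge_reference(float *out, const float *in, std::ptrdiff_t ldout,
                                int height, int width, float alpha, float beta)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
        {
            const float v = alpha * in[r * width + c];
            // First switch in the kernel: plain scaled store.
            // Second switch: reload the output, scale by beta, then accumulate.
            out[r * ldout + c] = (beta == 0.0f) ? v : beta * out[r * ldout + c] + v;
        }
    }
}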
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
new file mode 100644
index 0000000..b7323eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+namespace arm_gemm {
+
+/*
+ * Define "standard" transforms for the blocked GEMMs for SVE.
+ *
+ * This assumes that A is interleaved 'height' ways, B is interleaved
+ * 'width'xVL ways and transposed, and that the merge needs to work in
+ * 'height' x 'width'xVL blocks.
+ *
+ * The optional 'block' parameter is for kernels using dot-product type
+ * instructions like UDOT and SDOT.
+ */
+template<typename TOperand, typename TResult, unsigned int height, unsigned int width_vectors, unsigned int block=1, unsigned int mmla=1>
+class StdTransformsSVE
+{
+public:
+ template<typename TIn>
+ void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
+ const int ymax, const int k0, const int kmax, bool transposed) {
+ if (transposed) {
+ Transform<height, block, true>(out, in, stride, y0, ymax, k0, kmax);
+ } else {
+ Transform<height, block, false>(out, in, stride, y0, ymax, k0, kmax);
+ }
+ }
+
+ template<typename TIn>
+ void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
+ const int xmax, const int k0, const int kmax, bool transposed) {
+ if (transposed) {
+ Transform<width_vectors, block, false, true>(out, in, stride, x0, xmax, k0, kmax);
+ } else {
+ Transform<width_vectors, block, true, true>(out, in, stride, x0, xmax, k0, kmax);
+ }
+ }
+
+ template<typename TOut>
+ void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut alpha, const TOut beta) {
+ MergeResults<width_vectors / mmla, height, true>(out, in, stride, y0, ymax, x0, xmax, alpha, beta);
+ }
+};
+
+} // namespace arm_gemm
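StdTransformsSVE only forwards to the Transform<> and MergeResults<> templates, so the value of the class is the call order it encodes. A hedged usage sketch is below; the instantiation (float, 8-way height, 3 vector-widths, chosen to mirror the 3VLx8 merge earlier in this patch), the buffer names and the wrapper function are illustrative assumptions rather than library code, and the packed buffers are assumed to be sized by the caller.

#include "std_transforms_sve.hpp"

// Illustrative wrapper only: shows the pack / run / merge order implied by the
// class above. The GEMM inner kernel itself is elided.
void gemm_pack_run_merge(const float *A, const float *B, float *C,
                         float *packedA, float *packedB, const float *acc,
                         int lda, int ldb, int ldc,
                         int M, int N, int K, float alpha, float beta)
{
    arm_gemm::StdTransformsSVE<float, float, 8, 3> transforms;

    // Interleave A 'height' (8) ways over rows [0, M) and the K range [0, K).
    transforms.PrepareA(packedA, A, lda, 0, M, 0, K, /*transposed=*/false);

    // Transpose and interleave B 'width_vectors' (3) x VL ways over columns [0, N).
    transforms.PrepareB(packedB, B, ldb, 0, N, 0, K, /*transposed=*/false);

    // ... run the SVE GEMM kernel on packedA/packedB, producing 'acc' ...

    // Merge the accumulated tile back into C with the alpha/beta update.
    transforms.Merge(C, acc, ldc, 0, M, 0, N, alpha, beta);
}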
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index 77d0d87..e422b91 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -40,7 +40,7 @@
static void Transform(TOut* out, const TIn* const in, const int stride,
const int y0, const int ymax, const int x0, const int xmax) {
// For SVE cases we multiply the interleave factor by the vector length.
- const unsigned int IntBy = tIntBy * (sve ? get_vector_length<TOut>() : 1);
+ const unsigned int IntBy = tIntBy * (sve ? get_vector_length<TOut>() / BlockBy : 1);
const int n_whole_y_blocks = (ymax - y0) / IntBy;
const int y_remainders = (ymax - y0) % IntBy;
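The divisor added here appears to count the SVE interleave factor in groups of BlockBy elements rather than scaling the in-block elements by the vector length as well, which matters for the new block2/block4 interleaves included further down. A worked example under an assumed 256-bit SVE implementation (purely illustrative; other vector lengths scale the same way):

// Worked example only; a 256-bit SVE vector (8 x 32-bit lanes) is assumed.
constexpr unsigned int lanes    = 256 / 32; // what get_vector_length<float>() would report here
constexpr unsigned int tIntBy   = 8;        // e.g. the 8-way SVE interleaves added by this patch
constexpr unsigned int block_by = 2;        // a block-2 (dot-product style) layout

static_assert(tIntBy * lanes / block_by == 32,
              "with the BlockBy divisor the interleave factor is 32 on this machine, not 64");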
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
index 492abe5..1ccdf60 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
@@ -35,7 +35,7 @@
uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
- uint32_t zerobuff[8];
+ uint32_t zerobuff[16]; // 8 for asm loop plus up to 7 for overflow loop
for (int y=y0; y<ymax; y+=6) {
const uint32_t *inptr0 = inptr + y * ldin + k0;
@@ -137,7 +137,7 @@
: [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
[inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
:
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12"
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "memory"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
index 91ee492..500ed78 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
@@ -35,7 +35,7 @@
uint16_t *outptr = (uint16_t *)out;
const uint16_t *inptr = (const uint16_t *)in;
- uint16_t zerobuff[24];
+ uint16_t zerobuff[16]; // 8 for asm loop plus up to 7 for overflow loop
for (int y=y0; y<ymax; y+=8) {
const uint16_t *inptr0 = inptr + y * ldin + k0;
@@ -147,7 +147,7 @@
: [skippf] "r" (skippf)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
- "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ "v25", "v26", "v27", "v28", "v29", "v30", "v31", "memory"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
index 7a32f33..347eafb 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -35,7 +35,7 @@
uint32_t *outptr = (uint32_t *)out;
const uint32_t *inptr = (uint32_t *)in;
- uint32_t zerobuff[8];
+ uint32_t zerobuff[16]; // 8 for asm loop plus up to 7 for overflow loop
for (int y=y0; y<ymax; y+=8) {
const uint32_t *inptr0 = inptr + y * ldin + k0;
@@ -156,7 +156,7 @@
[inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
index 773d56d..88b40d7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
@@ -35,7 +35,7 @@
float *outptr = out;
const __fp16 *inptr = in;
- __fp16 zerobuff[8];
+ __fp16 zerobuff[16]; // 8 for asm loop plus up to 7 for overflow loop
for (int y=y0; y<ymax; y+=8) {
const __fp16 *inptr0 = inptr + y * ldin + k0;
@@ -172,7 +172,7 @@
[inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index 8ad5b85..fc1f2c2 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -23,9 +23,15 @@
*/
#include "a32_interleave_6way_32bit.hpp"
#include "a32_transpose_interleave_8way_32bit.hpp"
+#ifdef __ARM_FEATURE_SVE
+#include "sve_interleave_8way_32bit.hpp"
+#include "sve_interleave_8way_block2_32bit.hpp"
+#include "sve_interleave_8way_block4_8bit.hpp"
+#else
+#include "a64_interleave_8way_32bit.hpp"
+#endif
#include "a64_block16_interleave4_8bit.hpp"
#include "a64_interleave_8way_16bit.hpp"
-#include "a64_interleave_8way_32bit.hpp"
#include "a64_interleave_8way_half_to_float.hpp"
#include "a64_transpose_interleave_12way_16bit.hpp"
#include "a64_transpose_interleave_12way_half_to_float.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
new file mode 100644
index 0000000..752e837
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+template<>
+template<typename T>
+inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
+ const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
+
+ for (int y=y0; y<ymax; y+=8)
+ {
+ const int height = ymax-y;
+ const long inwidth = (kmax - k0);
+ const long outwidth = inwidth * 8;
+ long inpos = 0;
+ long outpos = 0;
+
+ uint32_t *outptr = master_outptr;
+ master_outptr += outwidth;
+
+ const uint32_t *inptr0 = inptr + y * ldin + k0;
+ const uint32_t *inptr1 = inptr0 + ldin;
+ const uint32_t *inptr2 = inptr1 + ldin;
+ const uint32_t *inptr3 = inptr2 + ldin;
+ const uint32_t *inptr4 = inptr3 + ldin;
+ const uint32_t *inptr5 = inptr4 + ldin;
+ const uint32_t *inptr6 = inptr5 + ldin;
+ const uint32_t *inptr7 = inptr6 + ldin;
+
+ switch(height)
+ {
+ case 1:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z0.s, z8.s, z4.s\n"
+ "zip2 z1.s, z8.s, z4.s\n"
+ "zip1 z2.s, z9.s, z4.s\n"
+ "zip2 z3.s, z9.s, z4.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z4.s\n"
+ "zip2 z11.s, z1.s, z4.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z4.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z4.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z4.s\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 2:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.s, #0\n"
+ "mov z14.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z4.s\n"
+ "zip2 z11.s, z1.s, z4.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "zip1 z0.s, z8.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z1.s, z8.s, z4.s\n"
+ "zip1 z2.s, z9.s, z4.s\n"
+ "zip2 z3.s, z9.s, z4.s\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z6.s, z11.s, z14.s\n"
+ "zip2 z7.s, z11.s, z14.s\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 3:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.s, #0\n"
+ "mov z14.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z4.s\n"
+ "zip2 z11.s, z1.s, z4.s\n"
+ "zip1 z12.s, z2.s, z4.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z14.s\n"
+ "zip2 z7.s, z11.s, z14.s\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 4:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z10.s, z1.s, z4.s\n"
+ "zip2 z11.s, z1.s, z4.s\n"
+ "zip1 z12.s, z2.s, z4.s\n"
+ "zip2 z13.s, z2.s, z4.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z15.s, z3.s, z4.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 5:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z5.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+ "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z12.s, z2.s, z5.s\n"
+ "zip2 z13.s, z2.s, z5.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z5.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z15.s, z3.s, z5.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 6:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z6.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+ "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+ "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z6.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z15.s, z3.s, z6.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 7:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z7.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+ "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+ "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+ "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ default:
+ case 8:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+ "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+ "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+ "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+ "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+ "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+ "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
+ "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
+ "incw %[inpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "incw %[outpos], all, mul #1\n"
+ "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+ "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+
+ }
+ }
+}
+
+#endif // __ARM_FEATURE_SVE
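
The file above implements an 8-way interleave of 32-bit data with SVE predication: each whilelt/incw loop loads one vector per live row, substitutes zero vectors for the missing rows, and a three-stage zip network emits, for every column, the eight row values back to back. The plain C++ reference below (not part of the patch) shows the intended output layout and may make the zip network easier to follow.

// Scalar reference for the SVE 8-way 32-bit interleave: for every column k in
// [k0, kmax), the packed output holds row0[k]..row7[k] contiguously, with rows
// beyond ymax padded with zeros.
#include <cstdint>
#include <cstdio>
#include <vector>

static void interleave8_ref(uint32_t *out, const uint32_t *in, int ldin,
                            int y0, int ymax, int k0, int kmax) {
    for (int y = y0; y < ymax; y += 8) {
        for (int k = k0; k < kmax; k++) {
            for (int r = 0; r < 8; r++) {
                *out++ = (y + r < ymax) ? in[(y + r) * ldin + k] : 0u;
            }
        }
    }
}

int main() {
    const int rows = 5, cols = 3;                 // deliberately a ragged tail (height 5)
    std::vector<uint32_t> in(rows * cols);
    for (int i = 0; i < rows * cols; i++) in[i] = static_cast<uint32_t>(i + 1);
    std::vector<uint32_t> out(8 * cols, 0);
    interleave8_ref(out.data(), in.data(), cols, 0, rows, 0, cols);
    for (uint32_t v : out) std::printf("%u ", v);
    std::printf("\n");
    return 0;
}
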
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
new file mode 100644
index 0000000..4cc4311
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
@@ -0,0 +1,632 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+template<>
+template<typename T>
+inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
+ const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
+
+ for (int y=y0; y<ymax; y+=8)
+ {
+ const int height = ymax-y;
+ const long inwidth = (kmax - k0);
+ const long outwidth = (inwidth * 8 + 1) / 2;
+ long inpos = 0;
+ long outpos = 0;
+
+ uint32_t *outptr = master_outptr;
+ master_outptr += (outwidth * 2);
+
+ const uint32_t *inptr0 = inptr + y * ldin + k0;
+ const uint32_t *inptr1 = inptr0 + ldin;
+ const uint32_t *inptr2 = inptr1 + ldin;
+ const uint32_t *inptr3 = inptr2 + ldin;
+ const uint32_t *inptr4 = inptr3 + ldin;
+ const uint32_t *inptr5 = inptr4 + ldin;
+ const uint32_t *inptr6 = inptr5 + ldin;
+ const uint32_t *inptr7 = inptr6 + ldin;
+
+ switch(height)
+ {
+ case 1:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "incw %[inpos], all, mul #1\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "zip1 z0.d, z8.d, z4.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z1.d, z8.d, z4.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z2.d, z9.d, z4.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "zip2 z3.d, z9.d, z4.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "st1d z8.d, p0, [%[outptr]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "zip1 z10.d, z1.d, z4.d\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "zip2 z11.d, z1.d, z4.d\n"
+ "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "zip1 z12.d, z2.d, z4.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z13.d, z2.d, z4.d\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "zip1 z14.d, z3.d, z4.d\n"
+ "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "zip2 z15.d, z3.d, z4.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 2:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "incw %[inpos], all, mul #1\n"
+ "zip1 z10.d, z1.d, z4.d\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "zip2 z11.d, z1.d, z4.d\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
+ "zip1 z0.d, z8.d, z4.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z1.d, z8.d, z4.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z2.d, z9.d, z4.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "zip2 z3.d, z9.d, z4.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "mov z14.s, #0\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "zip1 z4.d, z10.d, z14.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z5.d, z10.d, z14.d\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "zip1 z6.d, z11.d, z14.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z7.d, z11.d, z14.d\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "st1d z8.d, p0, [%[outptr]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "zip1 z12.d, z2.d, z6.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.d, z3.d, z7.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z15.d, z3.d, z7.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 3:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "zip1 z10.d, z1.d, z4.d\n"
+ "incw %[inpos], all, mul #1\n"
+ "zip2 z11.d, z1.d, z4.d\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "zip1 z12.d, z2.d, z4.d\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
+ "zip2 z13.d, z2.d, z4.d\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
+ "zip1 z0.d, z8.d, z12.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z1.d, z8.d, z12.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z2.d, z9.d, z13.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "zip2 z3.d, z9.d, z13.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "mov z14.s, #0\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "zip1 z4.d, z10.d, z14.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z5.d, z10.d, z14.d\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "zip1 z6.d, z11.d, z14.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z7.d, z11.d, z14.d\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "st1d z8.d, p0, [%[outptr]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "zip1 z12.d, z2.d, z6.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.d, z3.d, z7.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z15.d, z3.d, z7.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 4:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "zip1 z10.d, z1.d, z4.d\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "zip2 z11.d, z1.d, z4.d\n"
+ "incw %[inpos], all, mul #1\n"
+ "zip1 z12.d, z2.d, z4.d\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "zip2 z13.d, z2.d, z4.d\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
+ "zip1 z14.d, z3.d, z4.d\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
+ "zip2 z15.d, z3.d, z4.d\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
+ "zip1 z0.d, z8.d, z12.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z1.d, z8.d, z12.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z2.d, z9.d, z13.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "zip2 z3.d, z9.d, z13.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z4.d, z10.d, z14.d\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "zip2 z5.d, z10.d, z14.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z6.d, z11.d, z15.d\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "zip2 z7.d, z11.d, z15.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "st1d z8.d, p0, [%[outptr]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "zip1 z12.d, z2.d, z6.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.d, z3.d, z7.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z15.d, z3.d, z7.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 5:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z5.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "incw %[inpos], all, mul #1\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "zip1 z12.d, z2.d, z5.d\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
+ "zip2 z13.d, z2.d, z5.d\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
+ "zip1 z14.d, z3.d, z5.d\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
+ "zip2 z15.d, z3.d, z5.d\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
+ "zip1 z0.d, z8.d, z12.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z1.d, z8.d, z12.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z2.d, z9.d, z13.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "zip2 z3.d, z9.d, z13.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z4.d, z10.d, z14.d\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "zip2 z5.d, z10.d, z14.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z6.d, z11.d, z15.d\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "zip2 z7.d, z11.d, z15.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "st1d z8.d, p0, [%[outptr]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "zip1 z12.d, z2.d, z6.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.d, z3.d, z7.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z15.d, z3.d, z7.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 6:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z6.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "incw %[inpos], all, mul #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "zip1 z12.d, z2.d, z6.d\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "ld1w z5.s, p0/z, [%[inptr5]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
+ "zip1 z14.d, z3.d, z6.d\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
+ "zip2 z15.d, z3.d, z6.d\n"
+ "addvl %[inptr5], %[inptr5], #1\n"
+ "zip1 z0.d, z8.d, z12.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z1.d, z8.d, z12.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z2.d, z9.d, z13.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "zip2 z3.d, z9.d, z13.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z4.d, z10.d, z14.d\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "zip2 z5.d, z10.d, z14.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z6.d, z11.d, z15.d\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "zip2 z7.d, z11.d, z15.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "st1d z8.d, p0, [%[outptr]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "zip1 z12.d, z2.d, z6.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.d, z3.d, z7.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z15.d, z3.d, z7.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 7:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z7.s, #0\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "incw %[inpos], all, mul #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
+ "zip1 z14.d, z3.d, z7.d\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "ld1w z5.s, p0/z, [%[inptr5]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "ld1w z6.s, p0/z, [%[inptr6]]\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
+ "zip1 z12.d, z2.d, z6.d\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "addvl %[inptr5], %[inptr5], #1\n"
+ "zip2 z15.d, z3.d, z7.d\n"
+ "addvl %[inptr6], %[inptr6], #1\n"
+ "zip1 z0.d, z8.d, z12.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z1.d, z8.d, z12.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z2.d, z9.d, z13.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "zip2 z3.d, z9.d, z13.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z4.d, z10.d, z14.d\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "zip2 z5.d, z10.d, z14.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z6.d, z11.d, z15.d\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "zip2 z7.d, z11.d, z15.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "st1d z8.d, p0, [%[outptr]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "zip1 z12.d, z2.d, z6.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.d, z3.d, z7.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z15.d, z3.d, z7.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ default:
+ case 8:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.s, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "incw %[inpos], all, mul #1\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "ld1w z5.s, p0/z, [%[inptr5]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "ld1w z6.s, p0/z, [%[inptr6]]\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "ld1w z7.s, p0/z, [%[inptr7]]\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
+ "zip1 z12.d, z2.d, z6.d\n"
+ "addvl %[inptr5], %[inptr5], #1\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "addvl %[inptr6], %[inptr6], #1\n"
+ "zip1 z14.d, z3.d, z7.d\n"
+ "addvl %[inptr7], %[inptr7], #1\n"
+ "zip2 z15.d, z3.d, z7.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip1 z0.d, z8.d, z12.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z1.d, z8.d, z12.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "zip1 z2.d, z9.d, z13.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z3.d, z9.d, z13.d\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "zip1 z4.d, z10.d, z14.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z5.d, z10.d, z14.d\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "zip1 z6.d, z11.d, z15.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z7.d, z11.d, z15.d\n"
+ "zip1 z8.d, z0.d, z4.d\n"
+ "st1d z8.d, p0, [%[outptr]]\n"
+ "zip2 z9.d, z0.d, z4.d\n"
+ "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+ "zip1 z10.d, z1.d, z5.d\n"
+ "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+ "zip2 z11.d, z1.d, z5.d\n"
+ "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+ "zip1 z12.d, z2.d, z6.d\n"
+ "whilelt p0.d, %[outpos], %[outwidth]\n"
+ "zip2 z13.d, z2.d, z6.d\n"
+ "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+ "zip1 z14.d, z3.d, z7.d\n"
+ "incd %[outpos], all, mul #1\n"
+ "zip2 z15.d, z3.d, z7.d\n"
+ "whilelt p1.d, %[outpos], %[outwidth]\n"
+ "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p2.d, %[outpos], %[outwidth]\n"
+ "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "whilelt p3.d, %[outpos], %[outwidth]\n"
+ "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+ "incd %[outpos], all, mul #1\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+
+ }
+ }
+}
+
+#endif // __ARM_FEATURE_SVE
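
The block-2 variant above performs the same 8-way interleave but zips at doubleword (.d) granularity, so each pair of adjacent 32-bit columns stays together per row. The scalar reference below assumes (kmax - k0) is a multiple of 2; the assembly additionally predicates the ragged column tail. As before, rows beyond ymax are zero padded. This is illustrative code, not part of the patch.

// Scalar reference for the block-2 interleave under the even-width assumption:
// for each column pair (k, k+1), rows 0..7 contribute two adjacent values each.
#include <cstdint>
#include <cstdio>
#include <vector>

static void interleave8_block2_ref(uint32_t *out, const uint32_t *in, int ldin,
                                   int y0, int ymax, int k0, int kmax) {
    for (int y = y0; y < ymax; y += 8) {
        for (int k = k0; k < kmax; k += 2) {
            for (int r = 0; r < 8; r++) {
                const bool live = (y + r) < ymax;
                *out++ = live ? in[(y + r) * ldin + k] : 0u;
                *out++ = live ? in[(y + r) * ldin + k + 1] : 0u;
            }
        }
    }
}

int main() {
    const int rows = 3, cols = 4;                 // ragged height, even width
    std::vector<uint32_t> in(rows * cols);
    for (int i = 0; i < rows * cols; i++) in[i] = static_cast<uint32_t>(i + 1);
    std::vector<uint32_t> out(8 * cols, 0);
    interleave8_block2_ref(out.data(), in.data(), cols, 0, rows, 0, cols);
    for (uint32_t v : out) std::printf("%u ", v);
    std::printf("\n");
    return 0;
}
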
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
new file mode 100644
index 0000000..f1690ba
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+template<>
+template<typename T>
+inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint8_t *master_outptr = reinterpret_cast<uint8_t *>(out);
+ const uint8_t *inptr = reinterpret_cast<const uint8_t *>(in);
+
+ for (int y=y0; y<ymax; y+=8)
+ {
+ const int height = ymax-y;
+ const long inwidth = (kmax - k0);
+ const long outwidth = ((inwidth + 3) / 4) * 32;
+ long inpos = 0;
+ long outpos = 0;
+
+ uint8_t *outptr = master_outptr;
+ master_outptr += outwidth;
+
+ const uint8_t *inptr0 = inptr + y * ldin + k0;
+ const uint8_t *inptr1 = inptr0 + ldin;
+ const uint8_t *inptr2 = inptr1 + ldin;
+ const uint8_t *inptr3 = inptr2 + ldin;
+ const uint8_t *inptr4 = inptr3 + ldin;
+ const uint8_t *inptr5 = inptr4 + ldin;
+ const uint8_t *inptr6 = inptr5 + ldin;
+ const uint8_t *inptr7 = inptr6 + ldin;
+
+ switch(height)
+ {
+ case 1:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.b, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.b, #0\n"
+ "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+ "incb %[inpos], all, mul #1\n"
+ "whilelt p0.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "whilelt p1.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z0.s, z8.s, z4.s\n"
+ "zip2 z1.s, z8.s, z4.s\n"
+ "zip1 z2.s, z9.s, z4.s\n"
+ "zip2 z3.s, z9.s, z4.s\n"
+ "whilelt p2.b, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z4.s\n"
+ "zip2 z11.s, z1.s, z4.s\n"
+ "st1b z8.b, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z4.s\n"
+ "whilelt p3.b, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z4.s\n"
+ "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z4.s\n"
+ "whilelt p4.b, %[outpos], %[outwidth]\n"
+ "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p5.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p6.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+ "whilelt p7.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+ "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 2:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.b, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.b, #0\n"
+ "mov z14.b, #0\n"
+ "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+ "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+ "incb %[inpos], all, mul #1\n"
+ "whilelt p0.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z4.s\n"
+ "zip2 z11.s, z1.s, z4.s\n"
+ "whilelt p1.b, %[outpos], %[outwidth]\n"
+ "zip1 z0.s, z8.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z1.s, z8.s, z4.s\n"
+ "zip1 z2.s, z9.s, z4.s\n"
+ "zip2 z3.s, z9.s, z4.s\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "whilelt p2.b, %[outpos], %[outwidth]\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z6.s, z11.s, z14.s\n"
+ "zip2 z7.s, z11.s, z14.s\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "whilelt p3.b, %[outpos], %[outwidth]\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1b z8.b, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "whilelt p4.b, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p5.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+ "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+ "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 3:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.b, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.b, #0\n"
+ "mov z14.b, #0\n"
+ "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+ "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+ "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+ "incb %[inpos], all, mul #1\n"
+ "whilelt p0.b, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z4.s\n"
+ "zip2 z11.s, z1.s, z4.s\n"
+ "zip1 z12.s, z2.s, z4.s\n"
+ "whilelt p1.b, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.b, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z14.s\n"
+ "zip2 z7.s, z11.s, z14.s\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "whilelt p3.b, %[outpos], %[outwidth]\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "st1b z8.b, p0, [%[outptr]]\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "whilelt p4.b, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p5.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.b, %[outpos], %[outwidth]\n"
+ "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+ "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 4:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.b, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z4.b, #0\n"
+ "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+ "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+ "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+ "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
+ "incb %[inpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "whilelt p0.b, %[outpos], %[outwidth]\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z10.s, z1.s, z4.s\n"
+ "zip2 z11.s, z1.s, z4.s\n"
+ "zip1 z12.s, z2.s, z4.s\n"
+ "zip2 z13.s, z2.s, z4.s\n"
+ "whilelt p1.b, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z15.s, z3.s, z4.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.b, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "whilelt p3.b, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1b z8.b, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p4.b, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p5.b, %[outpos], %[outwidth]\n"
+ "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+ "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+ "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 5:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.b, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z5.b, #0\n"
+ "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+ "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+ "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+ "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
+ "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
+ "incb %[inpos], all, mul #1\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "whilelt p0.b, %[outpos], %[outwidth]\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z12.s, z2.s, z5.s\n"
+ "zip2 z13.s, z2.s, z5.s\n"
+ "whilelt p1.b, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z5.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z15.s, z3.s, z5.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.b, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "whilelt p3.b, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1b z8.b, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p4.b, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p5.b, %[outpos], %[outwidth]\n"
+ "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+ "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+ "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 6:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.b, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z6.b, #0\n"
+ "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+ "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+ "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+ "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
+ "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
+ "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
+ "incb %[inpos], all, mul #1\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.b, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "whilelt p1.b, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z6.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z15.s, z3.s, z6.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.b, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "whilelt p3.b, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1b z8.b, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p4.b, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p5.b, %[outpos], %[outwidth]\n"
+ "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+ "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+ "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ case 7:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.b, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "mov z7.b, #0\n"
+ "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+ "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+ "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+ "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
+ "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
+ "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
+ "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
+ "incb %[inpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "whilelt p0.b, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p1.b, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.b, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "whilelt p3.b, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1b z8.b, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p4.b, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p5.b, %[outpos], %[outwidth]\n"
+ "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+ "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+ "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+ default:
+ case 8:
+ __asm __volatile(
+ "1:\n"
+ "whilelt p0.b, %[inpos], %[inwidth]\n"
+ "b.none 2f\n"
+ "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+ "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+ "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+ "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
+ "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
+ "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
+ "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
+ "ld1b z7.b, p0/z, [%[inptr7], %[inpos]]\n"
+ "incb %[inpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "whilelt p0.b, %[outpos], %[outwidth]\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "whilelt p1.b, %[outpos], %[outwidth]\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "zip1 z0.s, z8.s, z12.s\n"
+ "zip2 z1.s, z8.s, z12.s\n"
+ "zip1 z2.s, z9.s, z13.s\n"
+ "whilelt p2.b, %[outpos], %[outwidth]\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z6.s, z11.s, z15.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
+ "whilelt p3.b, %[outpos], %[outwidth]\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip2 z9.s, z0.s, z4.s\n"
+ "zip1 z10.s, z1.s, z5.s\n"
+ "zip2 z11.s, z1.s, z5.s\n"
+ "st1b z8.b, p0, [%[outptr]]\n"
+ "zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p4.b, %[outpos], %[outwidth]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "incb %[outpos], all, mul #1\n"
+ "zip1 z14.s, z3.s, z7.s\n"
+ "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p5.b, %[outpos], %[outwidth]\n"
+ "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+ "whilelt p6.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+ "whilelt p7.b, %[outpos], %[outwidth]\n"
+ "incb %[outpos], all, mul #1\n"
+ "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+ "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+ "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+ "addvl %[outptr], %[outptr], #8\n"
+ "b 1b\n"
+ "2:\n"
+ : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
+ : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+ : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ );
+ break;
+
+
+ }
+ }
+}
+
+#endif // __ARM_FEATURE_SVE
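Every case above follows the same SVE pattern: whilelt builds a byte predicate from the current position and the total width, b.none exits once no lanes remain active, the predicated ld1b/st1b pairs handle the ragged tail without scalar clean-up, the zip1/zip2 sequence interleaves the rows, and incb advances the position by one vector of bytes. A minimal C++ sketch of that predicated loop using the ACLE SVE intrinsics (illustrative only, not part of the patch; predicated_copy is a made-up name and the zip-based interleaving is omitted):

#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#include <cstdint>

// Predicated copy of 'width' bytes, mirroring the whilelt / ld1b / st1b / incb
// structure of the assembly above.
void predicated_copy(const uint8_t *in, uint8_t *out, uint64_t width)
{
    for(uint64_t pos = 0; pos < width; pos += svcntb())
    {
        const svbool_t  pg = svwhilelt_b8(pos, width); // lanes [pos, width) active
        const svuint8_t v  = svld1_u8(pg, in + pos);   // zeroing load of active lanes
        svst1_u8(pg, out + pos, v);                    // store active lanes only
    }
}
#endif // __ARM_FEATURE_SVE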
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index b77bc7a..a1fc00e 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -24,6 +24,10 @@
#pragma once
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
// Macro for unreachable code (e.g. impossible default cases on switch)
#define UNREACHABLE(why) __builtin_unreachable()
@@ -31,23 +35,27 @@
// #define UNREACHABLE(why) assert(0 && why)
inline int iceildiv(const int a, const int b) {
- return (a + b - 1) / b;
+ return (a + b - 1) / b;
}
template <typename T>
inline T roundup(const T a, const T b) {
- T rem = a % b;
+ T rem = a % b;
- if (rem) {
- return a + b - rem;
- } else {
- return a;
- }
+ if (rem) {
+ return a + b - rem;
+ } else {
+ return a;
+ }
}
template <typename T>
inline unsigned long get_vector_length() {
+#ifdef __ARM_FEATURE_SVE
+ const unsigned long length = svcntb();
+#else
const unsigned long length = 16;
+#endif
return length / sizeof(T);
-}
+}
\ No newline at end of file
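With this change get_vector_length<T>() reports how many elements of type T fit in one SVE vector at run time (svcntb() returns the vector length in bytes), and keeps the fixed 128-bit NEON width as the fallback. A quick worked example, assuming a 256-bit SVE implementation:

// svcntb() == 32 on a 256-bit SVE machine, so:
//   get_vector_length<float>()   == 32 / sizeof(float)   == 8
//   get_vector_length<uint8_t>() == 32 / sizeof(uint8_t) == 32
// Without SVE the constant 16 gives the usual NEON widths (4 floats, 16 bytes).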
diff --git a/src/core/NEON/kernels/assembly/Helpers.cpp b/src/core/NEON/kernels/assembly/Helpers.cpp
new file mode 100644
index 0000000..09ac08c
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/Helpers.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
+
+#include "NEGEMMInterleavedStrategies.h"
+
+namespace arm_compute
+{
+namespace
+{
+template <typename InputType, bool use_dot = false>
+BlockSizes calculate_block_sizes_template(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K)
+{
+ using strategy = typename Kernel<InputType, use_dot>::strategy;
+ return calculate_block_sizes<strategy>(ci, M, N, K);
+}
+} // namespace
+
+const char *get_strategy_name(DataType input_type, bool use_dot)
+{
+ switch(input_type)
+ {
+ case DataType::F32:
+ return Kernel<float>::name;
+#ifdef __aarch64__
+ case DataType::U8:
+ case DataType::QASYMM8:
+ if(use_dot)
+ {
+ return Kernel<uint8_t, true>::name;
+ }
+ else
+ {
+ return Kernel<uint8_t, false>::name;
+ }
+ case DataType::S8:
+ if(use_dot)
+ {
+ return Kernel<int8_t, true>::name;
+ }
+ else
+ {
+ return Kernel<int8_t, false>::name;
+ }
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return Kernel<__fp16>::name;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ ARM_COMPUTE_ERROR("DataType not supported");
+ break;
+ }
+}
+
+BlockSizes calculate_block_sizes_from_data_type(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K, DataType input_type, bool use_dot)
+{
+ switch(input_type)
+ {
+ case DataType::F32:
+ return calculate_block_sizes_template<float>(ci, M, N, K);
+#ifdef __aarch64__
+ case DataType::U8:
+ case DataType::QASYMM8:
+ if(use_dot)
+ {
+ return calculate_block_sizes_template<uint8_t, true>(ci, M, N, K);
+ }
+ else
+ {
+ return calculate_block_sizes_template<uint8_t, false>(ci, M, N, K);
+ }
+ case DataType::S8:
+ if(use_dot)
+ {
+ return calculate_block_sizes_template<int8_t, true>(ci, M, N, K);
+ }
+ else
+ {
+ return calculate_block_sizes_template<int8_t, false>(ci, M, N, K);
+ }
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ return calculate_block_sizes_template<__fp16>(ci, M, N, K);
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ ARM_COMPUTE_ERROR("DataType not supported");
+ break;
+ }
+}
+} // namespace arm_compute
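A typical call into these helpers, sketched for illustration only (the problem sizes and the CPUInfo accessor used here are assumptions, not part of the patch):

// Sketch: query the strategy name and block sizes the interleaved GEMM
// would use for a QASYMM8 problem of size M x N x K.
const unsigned int M = 128, N = 128, K = 64;
const bool         use_dot = false;                         // would normally come from a CPU feature query
const CPUInfo     &ci      = NEScheduler::get().cpu_info(); // assumed accessor, for illustration
const char        *name    = get_strategy_name(DataType::QASYMM8, use_dot);
const BlockSizes   bs      = calculate_block_sizes_from_data_type(ci, M, N, K, DataType::QASYMM8, use_dot);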
diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
index c9037ab..0fc3610 100644
--- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
@@ -49,7 +49,7 @@
p.N = c->info()->tensor_shape().x();
p.K = a->info()->tensor_shape().x();
p.multis = b->info()->tensor_shape().z();
- p.batches = c->info()->tensor_shape().total_size_upper(2) / p.multis;
+ p.batches = c->info()->tensor_shape().total_size_upper(2) / p.multis; //COMPMID-1423: Agree on and document the layout of gemm inputs/outputs
return p;
}
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
index 715fe70..2c9cd32 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
@@ -37,7 +37,7 @@
void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params ¶ms, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads)
{
- using strategy = typename Kernel<To>::strategy;
+ using strategy = typename Kernel<To, use_dot>::strategy;
_prepared_a = prepared_a;
_transformed_b = transformed_b;
@@ -57,7 +57,7 @@
void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset,
const Coordinates &end_offset)
{
- using strategy = typename Kernel<To>::strategy;
+ using strategy = typename Kernel<To, use_dot>::strategy;
strategy strat(info.cpu_info);
TensorAccessor<To> prepared_a(*_prepared_a);
@@ -98,7 +98,7 @@
template <typename To, typename Tr, bool use_dot>
void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::create_workloads(std::vector<MatrixMultiplyWorkload> &workloads)
{
- using strategy = typename Kernel<To>::strategy;
+ using strategy = typename Kernel<To, use_dot>::strategy;
unsigned int offset_transformed_b = 0;
execute_window_loop(_block_walker, [&](const Coordinates & id)
@@ -127,6 +127,7 @@
});
}
+// TODO: Regroup these template instantiations somewhere?
template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<float, float>;
#ifdef __aarch64__
template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<uint8_t, uint32_t>;
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
index f33a14f..6c201ce 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
@@ -89,7 +89,6 @@
// Calculate the total size of the buffer:
size_t total = num_full_k * normal_k_size * (num_full_x * normal_x_size + left_over_x_size);
total += left_over_k_size * (left_over_x_size + num_full_x * normal_x_size);
- total *= sizeof(To);
return total;
}
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
index 26a8ade..69842fe 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
@@ -37,6 +37,10 @@
#include "../arm_gemm/kernels/a64_gemm_u8_4x4.hpp"
#include "../arm_gemm/kernels/a64_hgemm_24x8.hpp"
#include "../arm_gemm/kernels/a64_sgemm_12x8.hpp"
+#include "../arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
+#include "../arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
+#include "../arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
+#include "../arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
namespace arm_compute
{
@@ -47,48 +51,82 @@
{
};
+#define DEFINE_STRATEGY_SUFFIX(strat, suffix) \
+ using strategy = arm_gemm::strat; \
+ static constexpr const char *name = #strat suffix;
+
+#define DEFINE_STRATEGY(strat) \
+ DEFINE_STRATEGY_SUFFIX(strat, "")
+
+#ifdef __ARM_FEATURE_SVE
+template <>
+struct Kernel<float, false>
+{
+ DEFINE_STRATEGY(interleaved_fp32_mla_3VLx8)
+};
+template <>
+struct Kernel<float16_t, false>
+{
+ DEFINE_STRATEGY(interleaved_fp16_mla_3VLx8)
+};
+template <bool use_dot>
+struct Kernel<int8_t, use_dot>
+{
+ DEFINE_STRATEGY(interleaved_s8s32_dot_3VLx8)
+};
+template <bool use_dot>
+struct Kernel<uint8_t, use_dot>
+{
+ DEFINE_STRATEGY(interleaved_u8u32_dot_3VLx8)
+};
+#else /* __ARM_FEATURE_SVE */
+
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
struct Kernel<float16_t, false>
{
- using strategy = arm_gemm::hgemm_24x8;
+ DEFINE_STRATEGY(hgemm_24x8)
};
#endif /*__ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
#ifdef __aarch64__
template <>
struct Kernel<float, false>
{
- using strategy = arm_gemm::sgemm_12x8;
+ DEFINE_STRATEGY(sgemm_12x8)
};
template <>
struct Kernel<int8_t, false>
{
- using strategy = arm_gemm::gemm_s8_4x4;
+ DEFINE_STRATEGY(gemm_s8_4x4)
};
template <>
struct Kernel<uint8_t, false>
{
- using strategy = arm_gemm::gemm_u8_4x4;
+ DEFINE_STRATEGY(gemm_u8_4x4)
};
//Use different strategies for 8bit dot product:
template <>
struct Kernel<int8_t, true>
{
- using strategy = arm_gemm::gemm_s8_12x8;
+ DEFINE_STRATEGY_SUFFIX(gemm_s8_12x8, "_dot")
};
template <>
struct Kernel<uint8_t, true>
{
- using strategy = arm_gemm::gemm_u8_12x8;
+ DEFINE_STRATEGY_SUFFIX(gemm_u8_12x8, "_dot")
};
#else
template <>
struct Kernel<float, false>
{
- using strategy = arm_gemm::sgemm_8x6;
+ DEFINE_STRATEGY(sgemm_8x6)
};
#endif /* __aarch64__ */
+#endif /* __ARM_FEATURE_SVE */
+
+#undef DEFINE_STRATEGY
+#undef DEFINE_STRATEGY_SUFFIX
} // namespace
} // namespace arm_compute
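The DEFINE_STRATEGY / DEFINE_STRATEGY_SUFFIX macros only factor out spelling the strategy alias and its name string together. For example, the dot-product uint8_t specialisation above expands to:

template <>
struct Kernel<uint8_t, true>
{
    using strategy = arm_gemm::gemm_u8_12x8;
    static constexpr const char *name = "gemm_u8_12x8" "_dot"; // i.e. "gemm_u8_12x8_dot"
};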
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
index 1780a18..3b80a1f 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
@@ -66,6 +66,7 @@
}
unsigned int last_m = 0;
+    // TODO: Create a new iterate_1D(DimY) helper
int last_y = -1;
auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
{
diff --git a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
index fb217f0..e452dfb 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
@@ -102,6 +102,7 @@
const unsigned int multi = id.z();
const unsigned int ymax = std::min(y0 + strategy::out_height(), m_end);
+    // TODO(COMPMID-1424): Agree on GEMM IO layouts
strat.kernel(a(0, y0, batch, multi), a.stride(Window::DimY),
b(0, 0, multi), b.stride(Window::DimY),
c(0, y0, batch, multi), c.stride(Window::DimY),
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
index 9b3a60d..ca1de26 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
@@ -43,7 +43,7 @@
float* const outptr,
const int out_row_stride,
const int out_col_stride,
- const int, const int, const int, const int, const int, const int
+ const int, const int, const int, const int, const int, const int, const int, const int
)
{
// Copy pointers
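The two extra const int parameters appended to these unused-argument lists keep every process_tile specialisation on a common signature with the new fp16 and quantized u8/s32 implementations, whose declarations end with input_offset and weights_offset arguments; the fp32 kernels here simply ignore them, while the quantized variants are expected to consume them. As declared in the new impl headers, the tail of the parameter list reads:

    const int out_pad_bottom = 0,
    const int out_pad_right  = 0,
    const int input_offset   = 0,  // ignored by the fp32/fp16 tiles
    const int weights_offset = 0   // ignored by the fp32/fp16 tiles
);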
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
index dba2330..9ce43f9 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index b946e5d..21e8f04 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
@@ -43,7 +43,7 @@
float* const outptr,
const int out_row_stride,
const int out_col_stride,
- const int, const int, const int, const int, const int, const int
+ const int, const int, const int, const int, const int, const int, const int, const int
)
{
// Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index 2510941..c7113d0 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
@@ -43,7 +43,7 @@
float* const outptr,
const int out_row_stride,
const int out_col_stride,
- const int, const int, const int, const int, const int, const int
+ const int, const int, const int, const int, const int, const int, const int, const int
)
{
// Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp
new file mode 100644
index 0000000..33b55df
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_fp16_fp16.hpp"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
+} // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
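These tables map the padding configuration of a tile to a process_tile specialisation compiled with that padding baked in: tilefn_unpadded covers interior tiles, tilefn_top and tilefn_left cover the leading row and column, tilefn_bottom and tilefn_right are indexed by (input padding, output padding) on the trailing edges, and tilefn_generic is the fallback that takes the padding as run-time arguments. For example:

// Conv::tilefn_bottom[2][1] == ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>
// i.e. the variant compiled for a tile missing 2 input rows at the bottom and
// suppressing 1 output row, with all other padding known to be zero, whereas
// Conv::tilefn_generic == ConvImpl::template process_tile<false> reads the
// padding amounts from its run-time arguments instead.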
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index 44b93a1..c36c24e 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
@@ -43,7 +43,7 @@
float* const outptr,
const int out_row_stride,
const int out_col_stride,
- const int, const int, const int, const int, const int, const int
+ const int, const int, const int, const int, const int, const int, const int, const int
)
{
constexpr auto inner_tile_rows = DWC::inner_tile_rows;
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
new file mode 100644
index 0000000..8f22a64
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_u8_s32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp
new file mode 100644
index 0000000..09722d0
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_fp16_fp16.hpp"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
+} // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
index 8eb53a6..05315ee 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
new file mode 100644
index 0000000..cf51550
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_u8_s32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
new file mode 100644
index 0000000..dacfb24
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ * NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+#pragma once
+
+namespace depthwise
+{
+// Partial specialisation for FP16 to FP16
+template <int OutputTileRows, int OutputTileCols,
+ int KernelRows, int KernelCols,
+ int StrideRows, int StrideCols>
+struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float16_t, float16_t>
+{
+ typedef DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t
+ > DWC;
+
+ template <
+ bool Specialize=false, // Specialize (or not) the method
+ int InPadTop=0, // If specialized, top padding
+ int InPadLeft=0, // If specialized, left padding
+ int InPadBottom=0, // If specialized, bottom padding
+ int InPadRight=0, // If specialized, right padding
+ int OutPadBottom=0, // If specialized, bottom output padding
+ int OutPadRight=0 // If specialized, right output padding
+ >
+ static void process_tile(
+ const int n_channels,
+ const float16_t* const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float16_t* const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float16_t* const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int in_pad_top=0,
+ const int in_pad_left=0,
+ const int in_pad_bottom=0,
+ const int in_pad_right=0,
+ const int out_pad_bottom=0,
+ const int out_pad_right=0,
+ const int input_offset=0,
+ const int weights_offset=0
+ );
+};
+
+
+template <int OTR, int OTC, int KR, int KC, int SR, int SC>
+template <
+ bool Specialize,
+ int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
+ int OutPadBottom, int OutPadRight
+>
+void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float16_t, float16_t>::process_tile(
+ const int n_channels,
+ const float16_t *__restrict__ const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float16_t *__restrict__ const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float16_t *__restrict__ const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int _in_pad_top,
+ const int _in_pad_left,
+ const int _in_pad_bottom,
+ const int _in_pad_right,
+ const int _out_pad_bottom,
+ const int _out_pad_right,
+ const int _input_offset,
+ const int _weights_offset
+)
+{
+ constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+ constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+ constexpr auto kernel_rows = DWC::kernel_rows;
+ constexpr auto kernel_cols = DWC::kernel_cols;
+ constexpr auto output_tile_rows = DWC::output_tile_rows;
+ constexpr auto output_tile_cols = DWC::output_tile_cols;
+ constexpr auto stride_rows = DWC::stride_rows;
+ constexpr auto stride_cols = DWC::stride_cols;
+
+ // Extract parameters
+ const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
+ const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
+ const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
+ const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
+ const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
+ const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
+
+ // Compute valid ranges of the tile
+ const int in_cells_i = inner_tile_rows - in_pad_bottom;
+ const int in_cells_j = inner_tile_cols - in_pad_right;
+ const int out_cells_i = output_tile_rows - out_pad_bottom;
+ const int out_cells_j = output_tile_cols - out_pad_right;
+
+ // Instantiate pointers
+ const float16_t* __restrict__ inptr_base = inptr;
+ const float16_t* __restrict__ wptr_base = weights;
+ float16_t* __restrict__ outptr_base = outptr;
+
+ // Perform the depthwise convolution
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 8; channels_remaining -= 8)
+ {
+ // Load input tile
+ float16x8_t u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = vdupq_n_f16(0.0f);
+ }
+ else
+ {
+ u[i][j] = vld1q_f16(inptr_row + (j - in_pad_left)*in_col_stride);
+ }
+ }
+ }
+ inptr_base += 8;
+
+ // Load weights tile
+ float16x8_t w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = vld1q_f16(wptr_row + j*weight_col_stride);
+ }
+ }
+ wptr_base += 8;
+
+ // Perform the convolution
+ float16x8_t v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ if (in_i == 0 && in_j == 0)
+ {
+ // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vmulq_f16(w[in_i][in_j], u[i][j]);
+ }
+ else
+ {
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
+ }
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ float16_t* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
+ }
+ }
+ outptr_base += 8;
+ }
+#endif // __aarch64__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load input tile
+ float16_t u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = static_cast<float16_t>(0);
+ }
+ else
+ {
+ u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
+ }
+ }
+ }
+ inptr_base++;
+
+ // Load weights tile
+ float16_t w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = *(wptr_row + j*weight_col_stride);
+ }
+ }
+ wptr_base++;
+
+ // Perform the convolution
+ float16_t v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Clear the accumulator
+ v[out_i][out_j] = static_cast<float16_t>(0);
+
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ float16_t* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ *(outptr_row + j*out_col_stride) = v[i][j];
+ }
+ }
+ outptr_base++;
+ }
+}
+} // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
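
The `Specialize` template flag in the kernel above is the mechanism that lets one body serve both the generic and the fully specialised tile functions: when it is true the padding values collapse to compile-time constants, so the padding branches can be folded away; when it is false a single instantiation takes the padding at run time. A minimal, self-contained sketch of that pattern, assuming nothing beyond what this hunk shows (`fill_row` and its parameters are illustrative, not library API):

#include <cstdio>

template <bool Specialize = false, int PadRight = 0>
void fill_row(int *out, const int n, const int _pad_right)
{
  // When Specialize is true, pad_right is the compile-time constant PadRight
  // and the comparison below resolves at compile time; otherwise the run-time
  // argument is used, exactly as in process_tile above.
  const int pad_right = Specialize ? PadRight : _pad_right;
  for (int j = 0; j < n; j++)
  {
    out[j] = (j < n - pad_right) ? 1 : 0; // 1 = valid cell, 0 = padded cell
  }
}

int main()
{
  int row[8];
  fill_row<true, 2>(row, 8, /*_pad_right=*/0); // specialised instantiation
  fill_row<false>(row, 8, /*_pad_right=*/2);   // generic instantiation
  for (int j = 0; j < 8; j++)
  {
    std::printf("%d ", row[j]);
  }
  std::printf("\n");
  return 0;
}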
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
new file mode 100644
index 0000000..840086f
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ * NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+#pragma once
+
+namespace depthwise
+{
+// Partial specialisation for FP32 to FP32
+template <int OutputTileRows, int OutputTileCols,
+ int KernelRows, int KernelCols,
+ int StrideRows, int StrideCols>
+struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float, float>
+{
+ typedef DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float
+ > DWC;
+
+ template <
+ bool Specialize=false, // Specialize (or not) the method
+ int InPadTop=0, // If specialized, top padding
+ int InPadLeft=0, // If specialized, left padding
+ int InPadBottom=0, // If specialized, bottom padding
+ int InPadRight=0, // If specialized, right padding
+ int OutPadBottom=0, // If specialized, bottom output padding
+ int OutPadRight=0 // If specialized, right output padding
+ >
+ static void process_tile(
+ const int n_channels,
+ const float* const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float* const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float* const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int in_pad_top=0,
+ const int in_pad_left=0,
+ const int in_pad_bottom=0,
+ const int in_pad_right=0,
+ const int out_pad_bottom=0,
+ const int out_pad_right=0,
+ const int input_offset=0,
+ const int weights_offset=0
+ );
+};
+
+
+template <int OTR, int OTC, int KR, int KC, int SR, int SC>
+template <
+ bool Specialize,
+ int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
+ int OutPadBottom, int OutPadRight
+>
+void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float, float>::process_tile(
+ const int n_channels,
+ const float *__restrict__ const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float *__restrict__ const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float *__restrict__ const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int _in_pad_top,
+ const int _in_pad_left,
+ const int _in_pad_bottom,
+ const int _in_pad_right,
+ const int _out_pad_bottom,
+ const int _out_pad_right,
+ const int _input_offset,
+ const int _weights_offset
+)
+{
+ constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+ constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+ constexpr auto kernel_rows = DWC::kernel_rows;
+ constexpr auto kernel_cols = DWC::kernel_cols;
+ constexpr auto output_tile_rows = DWC::output_tile_rows;
+ constexpr auto output_tile_cols = DWC::output_tile_cols;
+ constexpr auto stride_rows = DWC::stride_rows;
+ constexpr auto stride_cols = DWC::stride_cols;
+
+ // Extract parameters
+ const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
+ const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
+ const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
+ const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
+ const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
+ const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
+
+ // Compute valid ranges of the tile
+ const int in_cells_i = inner_tile_rows - in_pad_bottom;
+ const int in_cells_j = inner_tile_cols - in_pad_right;
+ const int out_cells_i = output_tile_rows - out_pad_bottom;
+ const int out_cells_j = output_tile_cols - out_pad_right;
+
+ // Instantiate pointers
+ const float* __restrict__ inptr_base = inptr;
+ const float* __restrict__ wptr_base = weights;
+ float* __restrict__ outptr_base = outptr;
+
+ // Perform the depthwise convolution
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Load input tile
+ float32x4_t u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = vdupq_n_f32(0.0f);
+ }
+ else
+ {
+ u[i][j] = vld1q_f32(inptr_row + (j - in_pad_left)*in_col_stride);
+ }
+ }
+ }
+ inptr_base += 4;
+
+ // Load weights tile
+ float32x4_t w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const float* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = vld1q_f32(wptr_row + j*weight_col_stride);
+ }
+ }
+ wptr_base += 4;
+
+ // Perform the convolution
+ float32x4_t v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ if (in_i == 0 && in_j == 0)
+ {
+ // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vmulq_f32(w[in_i][in_j], u[i][j]);
+ }
+ else
+ {
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
+ }
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ float* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
+ }
+ }
+ outptr_base += 4;
+ }
+#endif // __aarch64__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load input tile
+ float u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = static_cast<float>(0);
+ }
+ else
+ {
+ u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
+ }
+ }
+ }
+ inptr_base++;
+
+ // Load weights tile
+ float w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const float* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = *(wptr_row + j*weight_col_stride);
+ }
+ }
+ wptr_base++;
+
+ // Perform the convolution
+ float v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Clear the accumulator
+ v[out_i][out_j] = static_cast<float>(0);
+
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ float* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ *(outptr_row + j*out_col_stride) = v[i][j];
+ }
+ }
+ outptr_base++;
+ }
+}
+
+} // namespace depthwise
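
In the aarch64 path of the FP32 kernel above, each weight tap is applied to four channels with a single fused multiply-accumulate, and the base pointers advance by 4 (vector path) or 1 (scalar tail) per channel, which is only valid when channels are contiguous in memory. The helper below isolates that one update step; it is a sketch rather than library API and compiles only for an Arm target with NEON:

#include <arm_neon.h>

// One weight tap over four consecutive channels: acc[c] += w[c] * u[c].
// This is the scalar `v[out_i][out_j] += w[in_i][in_j] * u[i][j]` of the tail
// loop, expressed with the same intrinsic the vector path uses.
static inline float32x4_t accumulate_tap(const float32x4_t acc,
                                         const float *const w, // 4 channels of one weight tap
                                         const float *const u) // 4 channels of one input cell
{
  return vmlaq_f32(acc, vld1q_f32(w), vld1q_f32(u));
}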
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
new file mode 100644
index 0000000..d0d8de5
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ * NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+#pragma once
+
+namespace depthwise
+{
+// Partial specialisation for U8 to S32
+template <int OutputTileRows, int OutputTileCols,
+ int KernelRows, int KernelCols,
+ int StrideRows, int StrideCols>
+struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, uint8_t, int32_t>
+{
+ typedef DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ uint8_t, int32_t
+ > DWC;
+
+ template <
+ bool Specialize=false, // Specialize (or not) the method
+ int InPadTop=0, // If specialized, top padding
+ int InPadLeft=0, // If specialized, left padding
+ int InPadBottom=0, // If specialized, bottom padding
+ int InPadRight=0, // If specialized, right padding
+ int OutPadBottom=0, // If specialized, bottom output padding
+ int OutPadRight=0 // If specialized, right output padding
+ >
+ static void process_tile(
+ const int n_channels,
+ const uint8_t* const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const uint8_t* const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ int32_t* const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int in_pad_top=0,
+ const int in_pad_left=0,
+ const int in_pad_bottom=0,
+ const int in_pad_right=0,
+ const int out_pad_bottom=0,
+ const int out_pad_right=0,
+ const int input_offset=0,
+ const int weights_offset=0);
+};
+
+
+template <int OTR, int OTC, int KR, int KC, int SR, int SC>
+template <
+ bool Specialize,
+ int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
+ int OutPadBottom, int OutPadRight
+>
+void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, uint8_t, int32_t>::process_tile(
+ const int n_channels,
+ const uint8_t *__restrict__ const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const uint8_t *__restrict__ const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ int32_t *__restrict__ const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int _in_pad_top,
+ const int _in_pad_left,
+ const int _in_pad_bottom,
+ const int _in_pad_right,
+ const int _out_pad_bottom,
+ const int _out_pad_right,
+ const int _input_offset,
+ const int _weights_offset
+)
+{
+ constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+ constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+ constexpr auto kernel_rows = DWC::kernel_rows;
+ constexpr auto kernel_cols = DWC::kernel_cols;
+ constexpr auto output_tile_rows = DWC::output_tile_rows;
+ constexpr auto output_tile_cols = DWC::output_tile_cols;
+ constexpr auto stride_rows = DWC::stride_rows;
+ constexpr auto stride_cols = DWC::stride_cols;
+
+ // Extract parameters
+ const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
+ const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
+ const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
+ const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
+ const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
+ const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
+
+ // Compute valid ranges of the tile
+ const int in_cells_i = inner_tile_rows - in_pad_bottom;
+ const int in_cells_j = inner_tile_cols - in_pad_right;
+ const int out_cells_i = output_tile_rows - out_pad_bottom;
+ const int out_cells_j = output_tile_cols - out_pad_right;
+
+ // Instantiate pointers
+ const uint8_t* __restrict__ inptr_base = inptr;
+ const uint8_t* __restrict__ wptr_base = weights;
+ int32_t* __restrict__ outptr_base = outptr;
+
+ // Perform the depthwise convolution
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ const int32x4_t v_input_offset = vdupq_n_s32(_input_offset);
+ const int32x4_t v_weights_offset = vdupq_n_s32(_weights_offset);
+ for (; channels_remaining >= 16; channels_remaining -= 16)
+ {
+ // Load input tile
+ int32x4x4_t u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j].val[0] = vdupq_n_s32(0);
+ u[i][j].val[1] = vdupq_n_s32(0);
+ u[i][j].val[2] = vdupq_n_s32(0);
+ u[i][j].val[3] = vdupq_n_s32(0);
+ }
+ else
+ {
+ const uint8x16_t uv = vld1q_u8(inptr_row + (j - in_pad_left)*in_col_stride);
+ u[i][j].val[0] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(uv)))));
+ u[i][j].val[1] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(uv)))));
+ u[i][j].val[2] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(uv)))));
+ u[i][j].val[3] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(uv)))));
+ }
+ }
+ }
+ inptr_base += 16;
+
+ // Load weights tile
+ int32x4x4_t w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ const uint8x16_t wv = vld1q_u8(wptr_row + j*weight_col_stride);
+ w[i][j].val[0] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(wv)))));
+ w[i][j].val[1] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(wv)))));
+ w[i][j].val[2] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(wv)))));
+ w[i][j].val[3] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(wv)))));
+ }
+ }
+ wptr_base += 16;
+
+ // Perform the convolution
+ int32x4x4_t v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ if (in_i == 0 && in_j == 0)
+ {
+ // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
+ v[out_i][out_j].val[0] = vmulq_s32(w[in_i][in_j].val[0], u[i][j].val[0]);
+ v[out_i][out_j].val[1] = vmulq_s32(w[in_i][in_j].val[1], u[i][j].val[1]);
+ v[out_i][out_j].val[2] = vmulq_s32(w[in_i][in_j].val[2], u[i][j].val[2]);
+ v[out_i][out_j].val[3] = vmulq_s32(w[in_i][in_j].val[3], u[i][j].val[3]);
+ }
+ else
+ {
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j].val[0] = vmlaq_s32(v[out_i][out_j].val[0], w[in_i][in_j].val[0], u[i][j].val[0]);
+ v[out_i][out_j].val[1] = vmlaq_s32(v[out_i][out_j].val[1], w[in_i][in_j].val[1], u[i][j].val[1]);
+ v[out_i][out_j].val[2] = vmlaq_s32(v[out_i][out_j].val[2], w[in_i][in_j].val[2], u[i][j].val[2]);
+ v[out_i][out_j].val[3] = vmlaq_s32(v[out_i][out_j].val[3], w[in_i][in_j].val[3], u[i][j].val[3]);
+ }
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ int32_t* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ vst1q_s32(outptr_row + j*out_col_stride, v[i][j].val[0]);
+ vst1q_s32(outptr_row + j*out_col_stride + 4, v[i][j].val[1]);
+ vst1q_s32(outptr_row + j*out_col_stride + 8, v[i][j].val[2]);
+ vst1q_s32(outptr_row + j*out_col_stride + 12, v[i][j].val[3]);
+ }
+ }
+ outptr_base += 16;
+ }
+#endif // __aarch64__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load input tile
+ int32_t u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = static_cast<int32_t>(0);
+ }
+ else
+ {
+ u[i][j] = static_cast<int32_t>(*(inptr_row + (j - in_pad_left)*in_col_stride)) + _input_offset;
+ }
+ }
+ }
+ inptr_base++;
+
+ // Load weights tile
+ int32_t w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = static_cast<int32_t>(*(wptr_row + j*weight_col_stride)) + _weights_offset;
+ }
+ }
+ wptr_base++;
+
+ // Perform the convolution
+ int32_t v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Clear the accumulator
+ v[out_i][out_j] = static_cast<int32_t>(0);
+
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ int32_t* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ *(outptr_row + j*out_col_stride) = v[i][j];
+ }
+ }
+ outptr_base++;
+ }
+}
+
+} // namespace depthwise
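
The uint8 kernel above differs from the float kernels only in how operands enter the accumulator: every value is widened to int32 and shifted by its asymmetric-quantisation offset before the multiply (the vector path does this with vmovl_u8/vaddw_s16 against v_input_offset/v_weights_offset, the scalar tail with static_cast plus _input_offset/_weights_offset). A plain C++ sketch of that arithmetic, with an illustrative function name and no claim to match library API:

#include <cstdint>

// acc = sum_k (u[k] + input_offset) * (w[k] + weights_offset), accumulated in
// int32_t just as the kernel above does; the int32 result is requantised
// outside this kernel.
int32_t quantised_dot(const uint8_t *u, const uint8_t *w, const int n,
                      const int32_t input_offset, const int32_t weights_offset)
{
  int32_t acc = 0;
  for (int k = 0; k < n; k++)
  {
    acc += (static_cast<int32_t>(u[k]) + input_offset) *
           (static_cast<int32_t>(w[k]) + weights_offset);
  }
  return acc;
}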
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
new file mode 100644
index 0000000..e66300d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace
+{
+
+template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
+void winograd_input_transform_1x8_fp32_process_tile(
+ int n_channels,
+ const float* const input_base,
+ const int input_row_stride,
+ const int input_col_stride,
+ float* const matrix_base,
+ const int matrix_stride,
+ const int _pad_top,
+ const int _pad_left,
+ const int _pad_bottom,
+ const int _pad_right
+)
+{
+ (void) input_row_stride; // No rows over which to stride
+ (void) _pad_top; // Never any top padding
+ (void) _pad_bottom; // Never any bottom padding
+
+ // Extract padding arguments
+ const int pad_left = Specialized ? PadLeft : _pad_left;
+ const int pad_right = Specialized ? PadRight : _pad_right;
+
+ constexpr int inner_tile_cols = 8;
+ const int cells_j = inner_tile_cols - pad_right;
+
+ float *outptr = matrix_base;
+
+ // Get pointers into the input tile
+ const float *x_ptrs[inner_tile_cols];
+ for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
+ {
+ x_ptrs[j] = input_base + xj*input_col_stride;
+ }
+
+ // Vectors used/computed in this kernel.
+ float x[inner_tile_cols];
+ float U[inner_tile_cols];
+
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = 0.0f;
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+#ifdef __arm_any__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ float32x4_t x[inner_tile_cols], U[inner_tile_cols];
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = vdupq_n_f32(0.0f);
+ }
+
+ // Load x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[j] = vld1q_f32(x_ptrs[j]);
+ x_ptrs[j] += 4;
+ }
+
+ // Compute U = x . X
+ U[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
+ U[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
+ U[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
+ U[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
+ U[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
+ U[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
+ U[6] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
+ U[7] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
+
+ // Store the transformed vector
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ vst1q_f32(outptr + j*matrix_stride, U[j]);
+ }
+ outptr += 4;
+ }
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ float32x2_t x[inner_tile_cols], U[inner_tile_cols];
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[j] = vdup_n_f32(0.0f);
+ }
+
+ // Load x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[j] = vld1_f32(x_ptrs[j]);
+ x_ptrs[j] += 2;
+ }
+
+ // Compute U = x . X
+ U[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
+ U[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
+ U[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
+ U[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
+ U[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
+ U[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
+ U[6] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
+ U[7] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
+
+ // Store the transformed vector
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ vst1_f32(outptr + j*matrix_stride, U[j]);
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[j] = *(x_ptrs[j]++);
+ }
+
+ // Compute U = x . X
+ U[0] = x[0]*-36 + x[4]*-14 + x[2]*49 + x[6]*1;
+ U[1] = x[5]*-1 + x[1]*-36 + x[4]*-13 + x[3]*13 + x[2]*36 + x[6]*1;
+ U[2] = x[3]*-13 + x[4]*-13 + x[1]*36 + x[2]*36 + x[5]*1 + x[6]*1;
+ U[3] = x[1]*-18 + x[4]*-10 + x[5]*-2 + x[2]*9 + x[3]*20 + x[6]*1;
+ U[4] = x[3]*-20 + x[4]*-10 + x[5]*2 + x[2]*9 + x[1]*18 + x[6]*1;
+ U[5] = x[1]*-12 + x[4]*-5 + x[5]*-3 + x[2]*4 + x[3]*15 + x[6]*1;
+ U[6] = x[3]*-15 + x[4]*-5 + x[5]*3 + x[2]*4 + x[1]*12 + x[6]*1;
+ U[7] = x[1]*-36 + x[5]*-14 + x[3]*49 + x[7]*1;
+
+ // Store the transformed vector
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ *(outptr + j*matrix_stride) = U[j];
+ }
+ outptr++;
+ }
+}
+
+} // namespace (anonymous)
+
+namespace winograd
+{
+template <int x>
+using Tiles = InputTransformImplTiles<1, x, 1, 8, float>;
+
+/*****************************************************************************/
+// 1x3 specialisations
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>,
+};
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
+};
+/*****************************************************************************/
+
+/*****************************************************************************/
+// 1x5 specialisations
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 2, 0, 0>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
+};
+/*****************************************************************************/
+
+/*****************************************************************************/
+// 1x7 specialisations
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
+
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
+
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_left_padded[n_pad_left] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 3, 0, 0>,
+};
+
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_right_padded[n_pad_right] = {
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
+ winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
+};
+/*****************************************************************************/
+
+
+template class InputTransform<1, 3, 1, 8, float>;
+template class InputTransform<3, 1, 8, 1, float>;
+template class InputTransform<1, 5, 1, 8, float>;
+template class InputTransform<5, 1, 8, 1, float>;
+template class InputTransform<1, 7, 1, 8, float>;
+template class InputTransform<7, 1, 8, 1, float>;
+} // namespace winograd
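
The intrinsic chains in the vector paths above are hard to audit by eye, but they all apply the same 1x8 input-transform matrix, whose rows can be read directly off the scalar tail (U = B^T x). The reference implementation below is a sketch for checking those coefficients, not part of the library:

#include <array>

// Rows of the 8-point input-transform matrix B^T, transcribed from the scalar
// tail of winograd_input_transform_1x8_fp32_process_tile (columns are x[0..7]).
std::array<float, 8> input_transform_1x8_reference(const std::array<float, 8> &x)
{
  static const int BT[8][8] = {
    { -36,   0,  49,   0, -14,   0, 1, 0 },
    {   0, -36,  36,  13, -13,  -1, 1, 0 },
    {   0,  36,  36, -13, -13,   1, 1, 0 },
    {   0, -18,   9,  20, -10,  -2, 1, 0 },
    {   0,  18,   9, -20, -10,   2, 1, 0 },
    {   0, -12,   4,  15,  -5,  -3, 1, 0 },
    {   0,  12,   4, -15,  -5,   3, 1, 0 },
    {   0, -36,   0,  49,   0, -14, 0, 1 },
  };
  std::array<float, 8> U{};
  for (int i = 0; i < 8; i++)
  {
    for (int j = 0; j < 8; j++)
    {
      U[i] += BT[i][j] * x[j];
    }
  }
  return U;
}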
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
index 6d8afc0..4203945 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
@@ -29,91 +29,36 @@
namespace winograd
{
-using Transform = WinogradGEMM<2, 2, 3, 3>::InputTransform<float>;
+using Tiles = InputTransformImplTiles<3, 3, 4, 4, float>;
-/******************************************************************************
- * Cost methods for the input transform.
- * =====================================
- */
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &input_shape)
+namespace
{
- // NOTE: Cost in FLOPs rather than instructions or uops.
- const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows);
- const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols);
- return 16 * 16 * tile_M * tile_N * input_shape.n_channels;
-}
-/*****************************************************************************/
-/*****************************************************************************
-* F(2x2, 3x3) implies the use of a 4x4 input tile. Such tiles can require a
-* variety of padding types. For example, tiles at the top and left of an image
-* can require one row or column of padding on their top and left sides if the
-* padding type is SAME (where X represents a padded value):
-*
-* _______ _______
-* |X X X X| |X X X X|
-* |X | | | . . .
-* |X | | |
-* |X______| |_______|
-* _______
-* |X | .
-* |X | . . . .
-* |X | .
-* |X______|
-*
-* For tiles near the right or bottom of the image it is more complicated. Such
-* tiles might require padding by 0 or 1 rows or columns if the padding type is
-* VALID or 1 or 2 rows or columns if the padding type is SAME:
-*
-* _______ _______ _______ _______
-* |X X X X| |X X X X| |X X X X| |X X X X|
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X______| |_______| |______X| |____X_X|
-* _______ _______ _______ _______
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X______| |_______| |______X| |____X_X|
-* _______ _______ _______ _______
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
-* _______ _______ _______ _______
-* |X | | | | X| | X X|
-* |X | | | | X| | X X|
-* |X X X X| |X X X X| |X X X X| |X X X X|
-* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
-*
-* Additional tiles are required for especially small input images.
-*
-* Build an array of the specialised methods that deal with each of the
-* different padding combinations which may be required. These padding
-* constraints are the space:
-*
-* Padding top in {0, 1}
-* Padding left in {0, 1}
-* Padding bottom in {0, 1, 2}
-* Padding right in {0, 1, 2}
-*/
-template <>
-template <>
-template <int pad_top, int pad_left, int pad_bottom, int pad_right>
-void Transform::process_tile(
+
+template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
+void winograd_input_transform_4x4_fp32_process_tile(
int n_channels,
const float* const input_base,
const int input_row_stride,
const int input_col_stride,
float* const matrix_base,
- const int matrix_stride
-)
+ const int matrix_stride,
+ const int _pad_top,
+ const int _pad_left,
+ const int _pad_bottom,
+ const int _pad_right
+ )
{
+ const int pad_top = Specialized ? PadTop : _pad_top;
+ const int pad_left = Specialized ? PadLeft : _pad_left;
+ const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+ const int pad_right = Specialized ? PadRight : _pad_right;
+
constexpr int inner_tile_i = 4, inner_tile_j = 4;
- constexpr int cells_i = inner_tile_i - pad_bottom;
- constexpr int cells_j = inner_tile_i - pad_right;
+ const int cells_i = inner_tile_i - pad_bottom;
+ const int cells_j = inner_tile_j - pad_right;
+
+
float *outptr = matrix_base;
@@ -327,83 +272,40 @@
}
}
+} // namespace (anonymous)
+
template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_input_transform_4x4_fp32_process_tile<false>;
+
template <>
-const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] =
-{
- {
- {
- {
- Transform::template process_tile<0, 0, 0, 0>, // No padding
- Transform::template process_tile<0, 0, 0, 1>, // Right
- Transform::template process_tile<0, 0, 0, 2>, // Right
- },
- {
- Transform::template process_tile<0, 0, 1, 0>, // Bottom
- Transform::template process_tile<0, 0, 1, 1>, // Bottom-right
- Transform::template process_tile<0, 0, 1, 2>, // Bottom-right
- },
- {
- Transform::template process_tile<0, 0, 2, 0>, // Bottom
- Transform::template process_tile<0, 0, 2, 1>, // Bottom-right
- Transform::template process_tile<0, 0, 2, 2>, // Bottom-right
- }
- },
- {
- {
- Transform::template process_tile<0, 1, 0, 0>, // Left
- Transform::template process_tile<0, 1, 0, 1>, // Left AND right
- Transform::template process_tile<0, 1, 0, 2>, // Left AND right
- },
- {
- Transform::template process_tile<0, 1, 1, 0>, // Left-bottom
- Transform::template process_tile<0, 1, 1, 1>, // Left, bottom AND right
- Transform::template process_tile<0, 1, 1, 2>, // Left, bottom AND right
- },
- {
- Transform::template process_tile<0, 1, 2, 0>, // Left-bottom
- Transform::template process_tile<0, 1, 2, 1>, // Left, bottom AND right
- Transform::template process_tile<0, 1, 2, 2>, // Left, bottom AND right
- }
- },
- },
- {
- {
- {
- Transform::template process_tile<1, 0, 0, 0>, // Top
- Transform::template process_tile<1, 0, 0, 1>, // Top-right
- Transform::template process_tile<1, 0, 0, 2>, // Top-right
- },
- {
- Transform::template process_tile<1, 0, 1, 0>, // Top AND bottom
- Transform::template process_tile<1, 0, 1, 1>, // Top, bottom AND right
- Transform::template process_tile<1, 0, 1, 2>, // Top, bottom AND right
- },
- {
- Transform::template process_tile<1, 0, 2, 0>, // Top AND bottom
- Transform::template process_tile<1, 0, 2, 1>, // Top, bottom AND right
- Transform::template process_tile<1, 0, 2, 2>, // Top, bottom AND right
- }
- },
- {
- {
- Transform::template process_tile<1, 1, 0, 0>, // Top-left
- Transform::template process_tile<1, 1, 0, 1>, // Top, left AND right
- Transform::template process_tile<1, 1, 0, 2>, // Top, left AND right
- },
- {
- Transform::template process_tile<1, 1, 1, 0>, // Top, left AND bottom
- Transform::template process_tile<1, 1, 1, 1>, // All padded
- Transform::template process_tile<1, 1, 1, 2>, // All padded
- },
- {
- Transform::template process_tile<1, 1, 2, 0>, // Top, left AND bottom
- Transform::template process_tile<1, 1, 2, 1>, // All padded
- Transform::template process_tile<1, 1, 2, 2>, // All padded
- }
- }
- }
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_input_transform_4x4_fp32_process_tile<true>;
+
+
+template <>
+const Tiles::TileFn Tiles::tilefn_top_padded[n_pad_top] = {
+ winograd_input_transform_4x4_fp32_process_tile<true, 1, 0, 0, 0>,
};
-template struct WinogradGEMM<2, 2, 3, 3>::InputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_left_padded[n_pad_left] = {
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 1, 0, 0>,
+};
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 1, 0>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 2, 0>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 3, 0>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 4, 0>,
+};
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 4>,
+};
+
+template class InputTransform<3, 3, 4, 4, float>;
} // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp
deleted file mode 100644
index ebc0c07..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp
+++ /dev/null
@@ -1,458 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace winograd
-{
-
-using Transform = WinogradGEMM<2, 2, 5, 5>::InputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &input_shape)
-{
- (void) input_shape;
- return 0;
-}
-
-/*****************************************************************************
-* F(2x2, 5x5) implies the use of a 6x6 input tile.
-*
-* Build an array of the specialised methods that deal with each of the
-* different padding combinations which may be required. These padding
-* constraints are the space:
-*
-* Padding top in {0, 2}
-* Padding left in {0, 2}
-* Padding bottom in {0, 1, 2, 3, 4}
-* Padding right in {0, 1, 2, 3, 4}
-*/
-template <>
-template <>
-template <int pad_top, int pad_left, int pad_bottom, int pad_right>
-void Transform::process_tile(
- int n_channels,
- const float* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- float* const matrix_base,
- const int matrix_stride
-)
-{
- constexpr int cells_i = 6 - pad_bottom;
- constexpr int cells_j = 6 - pad_right;
-
- float *outptr = matrix_base;
-
- // Get pointers into the input tile
- const float *x_ptrs[6][6];
- for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
- {
- // Get a pointer into the row
- const float* const row_ptr = input_base + xi*input_row_stride;
-
- for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
- {
- x_ptrs[i][j] = row_ptr + xj*input_col_stride;
- }
- }
-
- // Matrices used/computed in this kernel.
- float x[6][6], XTx[6][6], U[6][6];
- for (int i = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++)
- {
- x[i][j] = XTx[i][j] = 0.0f;
- }
- }
-
- // Perform the Winograd input transformation for each channel in the input
- // tensor.
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used/computed in this kernel
- float32x4_t x[6][6], XTx[6][6], U[6][6];
- for (int i = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++)
- {
- x[i][j] = vdupq_n_f32(0.0f);
- XTx[i][j] = vdupq_n_f32(0.0f);
- }
- }
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = vld1q_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 4;
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
- // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f);
-
- // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f);
-
- // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < 6; i++)
- {
- // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
- // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
- // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
- // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used/computed in this kernel
- float32x2_t x[6][6], XTx[6][6], U[6][6];
- for (int i = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++)
- {
- x[i][j] = vdup_n_f32(0.0f);
- XTx[i][j] = vdup_n_f32(0.0f);
- }
- }
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = vld1_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 2;
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
- // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
-
- // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
-
- // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < 6; i++)
- {
- // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
- // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
- // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
- // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Load x
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = *(x_ptrs[i][j]++);
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < 6; i++)
- {
- U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- *(outptr + m*matrix_stride) = U[i][j];
- }
- }
- outptr++;
- }
-}
-
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] =
-{
- {
- {
- {
- Transform::template process_tile<0, 0, 0, 0>, // No padding
- Transform::template process_tile<0, 0, 0, 1>, // Right
- Transform::template process_tile<0, 0, 0, 2>, // " "
- Transform::template process_tile<0, 0, 0, 3>, // " "
- Transform::template process_tile<0, 0, 0, 4>, // " "
- },
- {
- Transform::template process_tile<0, 0, 1, 0>, // Bottom
- Transform::template process_tile<0, 0, 1, 1>, // Bottom right
- Transform::template process_tile<0, 0, 1, 2>, // " "
- Transform::template process_tile<0, 0, 1, 3>, // " "
- Transform::template process_tile<0, 0, 1, 4>, // " "
- },
- {
- Transform::template process_tile<0, 0, 2, 0>, // Bottom
- Transform::template process_tile<0, 0, 2, 1>, // Bottom right
- Transform::template process_tile<0, 0, 2, 2>, // " "
- Transform::template process_tile<0, 0, 2, 3>, // " "
- Transform::template process_tile<0, 0, 2, 4>, // " "
- },
- {
- Transform::template process_tile<0, 0, 3, 0>, // Bottom
- Transform::template process_tile<0, 0, 3, 1>, // Bottom right
- Transform::template process_tile<0, 0, 3, 2>, // " "
- Transform::template process_tile<0, 0, 3, 3>, // " "
- Transform::template process_tile<0, 0, 3, 4>, // " "
- },
- {
- Transform::template process_tile<0, 0, 4, 0>, // Bottom
- Transform::template process_tile<0, 0, 4, 1>, // Bottom right
- Transform::template process_tile<0, 0, 4, 2>, // " "
- Transform::template process_tile<0, 0, 4, 3>, // " "
- Transform::template process_tile<0, 0, 4, 4>, // " "
- }
- },
- {
- {
- Transform::template process_tile<0, 2, 0, 0>, // Left
- Transform::template process_tile<0, 2, 0, 1>,
- Transform::template process_tile<0, 2, 0, 2>,
- Transform::template process_tile<0, 2, 0, 3>,
- Transform::template process_tile<0, 2, 0, 4>,
- },
- {
- Transform::template process_tile<0, 2, 1, 0>, // Bottom left
- Transform::template process_tile<0, 2, 1, 1>,
- Transform::template process_tile<0, 2, 1, 2>,
- Transform::template process_tile<0, 2, 1, 3>,
- Transform::template process_tile<0, 2, 1, 4>,
- },
- {
- Transform::template process_tile<0, 2, 2, 0>, // " "
- Transform::template process_tile<0, 2, 2, 1>,
- Transform::template process_tile<0, 2, 2, 2>,
- Transform::template process_tile<0, 2, 2, 3>,
- Transform::template process_tile<0, 2, 2, 4>,
- },
- {
- Transform::template process_tile<0, 2, 3, 0>, // " "
- Transform::template process_tile<0, 2, 3, 1>,
- Transform::template process_tile<0, 2, 3, 2>,
- Transform::template process_tile<0, 2, 3, 3>,
- Transform::template process_tile<0, 2, 3, 4>,
- },
- {
- Transform::template process_tile<0, 2, 4, 0>, // " "
- Transform::template process_tile<0, 2, 4, 1>,
- Transform::template process_tile<0, 2, 4, 2>,
- Transform::template process_tile<0, 2, 4, 3>,
- Transform::template process_tile<0, 2, 4, 4>,
- }
- }
- },
- {
- {
- {
- Transform::template process_tile<2, 0, 0, 0>, // Top
- Transform::template process_tile<2, 0, 0, 1>, // Top right
- Transform::template process_tile<2, 0, 0, 2>, // " "
- Transform::template process_tile<2, 0, 0, 3>, // " "
- Transform::template process_tile<2, 0, 0, 4>, // " "
- },
- {
- Transform::template process_tile<2, 0, 1, 0>,
- Transform::template process_tile<2, 0, 1, 1>,
- Transform::template process_tile<2, 0, 1, 2>,
- Transform::template process_tile<2, 0, 1, 3>,
- Transform::template process_tile<2, 0, 1, 4>,
- },
- {
- Transform::template process_tile<2, 0, 2, 0>,
- Transform::template process_tile<2, 0, 2, 1>,
- Transform::template process_tile<2, 0, 2, 2>,
- Transform::template process_tile<2, 0, 2, 3>,
- Transform::template process_tile<2, 0, 2, 4>,
- },
- {
- Transform::template process_tile<2, 0, 3, 0>,
- Transform::template process_tile<2, 0, 3, 1>,
- Transform::template process_tile<2, 0, 3, 2>,
- Transform::template process_tile<2, 0, 3, 3>,
- Transform::template process_tile<2, 0, 3, 4>,
- },
- {
- Transform::template process_tile<2, 0, 4, 0>,
- Transform::template process_tile<2, 0, 4, 1>,
- Transform::template process_tile<2, 0, 4, 2>,
- Transform::template process_tile<2, 0, 4, 3>,
- Transform::template process_tile<2, 0, 4, 4>,
- },
- },
- {
- {
- Transform::template process_tile<2, 2, 0, 0>, // Top left
- Transform::template process_tile<2, 2, 0, 1>,
- Transform::template process_tile<2, 2, 0, 2>,
- Transform::template process_tile<2, 2, 0, 3>,
- Transform::template process_tile<2, 2, 0, 4>,
- },
- {
- Transform::template process_tile<2, 2, 1, 0>,
- Transform::template process_tile<2, 2, 1, 1>,
- Transform::template process_tile<2, 2, 1, 2>,
- Transform::template process_tile<2, 2, 1, 3>,
- Transform::template process_tile<2, 2, 1, 4>,
- },
- {
- Transform::template process_tile<2, 2, 2, 0>,
- Transform::template process_tile<2, 2, 2, 1>,
- Transform::template process_tile<2, 2, 2, 2>,
- Transform::template process_tile<2, 2, 2, 3>,
- Transform::template process_tile<2, 2, 2, 4>,
- },
- {
- Transform::template process_tile<2, 2, 3, 0>,
- Transform::template process_tile<2, 2, 3, 1>,
- Transform::template process_tile<2, 2, 3, 2>,
- Transform::template process_tile<2, 2, 3, 3>,
- Transform::template process_tile<2, 2, 3, 4>,
- },
- {
- Transform::template process_tile<2, 2, 4, 0>,
- Transform::template process_tile<2, 2, 4, 1>,
- Transform::template process_tile<2, 2, 4, 2>,
- Transform::template process_tile<2, 2, 4, 3>,
- Transform::template process_tile<2, 2, 4, 4>,
- }
- }
- }
-};
-
-template struct WinogradGEMM<2, 2, 5, 5>::InputTransform<float>;
-} // namespace winograd
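Reading the coefficients off the scalar path of the input transform above, the one-dimensional matrix XT that is applied first down the columns (XTx = XT . x) and then along the rows (U = XTx . X) of the 6x6 input tile is

    XT = \begin{bmatrix}
           4 &  0 & -5 &  0 & 1 & 0 \\
           0 & -4 & -4 &  1 & 1 & 0 \\
           0 &  4 & -4 & -1 & 1 & 0 \\
           0 & -2 & -1 &  2 & 1 & 0 \\
           0 &  2 & -1 & -2 & 1 & 0 \\
           0 &  4 &  0 & -5 & 0 & 1
         \end{bmatrix}

The NEON paths evaluate exactly the same expressions via chained vmla/vmls calls, four (aarch64) or two (32-bit NEON) channels at a time.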
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp
deleted file mode 100644
index 04d1573..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace winograd
-{
-
-using Transform = WinogradGEMM<4, 4, 3, 3>::InputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &input_shape)
-{
- // NOTE: Cost in FLOPs rather than instructions or uops.
- const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows);
- const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols);
- return 12 * 24 * tile_M * tile_N * input_shape.n_channels;
-}
-
-/* F(4x4, 3x3) implies the use of a 6x6 input tile. Such tiles can require a
-* variety of padding types. For example, tiles at the top and left of an
-* image can require one row or column of padding on their top and left sides
-* if the padding type is SAME (where X represents a padded value):
-*
-* ___________ ___________
-* |X X X X X X| |X X X X X X|
-* |X | | |
-* |X | | |
-* |X | | |
-* |X | | |
-* |X__________| |___________|
-* ___________
-* |X |
-* |X |
-* |X |
-* |X |
-* |X |
-* |X__________|
-*
-* For tiles near the right or bottom of the image it is more complicated.
-* Such tiles might require padding by 0, 1, 2 or 3 rows or columns if the
-* padding type is VALID or 1, 2, 3 or 4 rows or columns if the padding
-* type is SAME.
-*
-* Build an array of the specialised methods that deal with each of the
-* different padding combinations which may be required. These padding
-* constraints are the space:
-*
-* Padding top in {0, 1}
-* Padding left in {0, 1}
-* Padding bottom in {0, 1, 2, 3, 4}
-* Padding right in {0, 1, 2, 3, 4}
-*/
-template <>
-template <>
-template <int pad_top, int pad_left, int pad_bottom, int pad_right>
-void Transform::process_tile(
- int n_channels,
- const float* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- float* const matrix_base,
- const int matrix_stride
-)
-{
- constexpr int cells_i = 6 - pad_bottom;
- constexpr int cells_j = 6 - pad_right;
-
- float *outptr = matrix_base;
-
- // Get pointers into the input tile
- const float *x_ptrs[6][6];
- for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
- {
- // Get a pointer into the row
- const float* const row_ptr = input_base + xi*input_row_stride;
-
- for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
- {
- x_ptrs[i][j] = row_ptr + xj*input_col_stride;
- }
- }
-
- // Matrices used/computed in this kernel.
- float x[6][6], XTx[6][6], U[6][6];
- for (int i = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++)
- {
- x[i][j] = XTx[i][j] = 0.0f;
- }
- }
-
- // Perform the Winograd input transformation for each channel in the input
- // tensor.
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
- {
- // Matrices used/computed in this kernel
- float32x4_t x[6][6], XTx[6][6], U[6][6];
- for (int i = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++)
- {
- x[i][j] = vdupq_n_f32(0.0f);
- XTx[i][j] = vdupq_n_f32(0.0f);
- }
- }
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = vld1q_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 4;
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
- // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f);
-
- // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f);
-
- // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < 6; i++)
- {
- // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
- // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
- // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
- // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1q_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 4;
- }
-#endif // __aarch64__
-#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used/computed in this kernel
- float32x2_t x[6][6], XTx[6][6], U[6][6];
- for (int i = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++)
- {
- x[i][j] = vdup_n_f32(0.0f);
- XTx[i][j] = vdup_n_f32(0.0f);
- }
- }
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = vld1_f32(x_ptrs[i][j]);
- x_ptrs[i][j] += 2;
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
- // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
-
- // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
-
- // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
-
- // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < 6; i++)
- {
- // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
- // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
- // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
- // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
- // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- vst1_f32(outptr + m*matrix_stride, U[i][j]);
- }
- }
- outptr += 2;
- }
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
- {
- // Load x
- for (int i = pad_top; i < cells_i; i++)
- {
- for (int j = pad_left; j < cells_j; j++)
- {
- x[i][j] = *(x_ptrs[i][j]++);
- }
- }
-
- // Compute XT . x
- for (int j = pad_left; j < cells_j; j++)
- {
- XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
- XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
- XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
- XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
- XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
- XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
- }
-
- // Compute U = XT . x . X
- for (int i = 0; i < 6; i++)
- {
- U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
- U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
- U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
- U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
- U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
- U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
- }
-
- // Store the transformed matrix
- for (int i = 0, m = 0; i < 6; i++)
- {
- for (int j = 0; j < 6; j++, m++)
- {
- *(outptr + m*matrix_stride) = U[i][j];
- }
- }
- outptr++;
- }
-}
-
-/* Build an array of pointers to the specialised tile-processing functions,
- * one for each padding combination in the space described above: top and
- * left padding in {0, 1}, bottom and right padding in {0, 1, 2, 3, 4}.
- */
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] =
-{
- {
- {
- {
- Transform::template process_tile<0, 0, 0, 0>, // No padding
- Transform::template process_tile<0, 0, 0, 1>, // Right
- Transform::template process_tile<0, 0, 0, 2>, // " "
- Transform::template process_tile<0, 0, 0, 3>, // " "
- Transform::template process_tile<0, 0, 0, 4>, // " "
- },
- {
- Transform::template process_tile<0, 0, 1, 0>, // Bottom
- Transform::template process_tile<0, 0, 1, 1>, // Bottom right
- Transform::template process_tile<0, 0, 1, 2>, // " "
- Transform::template process_tile<0, 0, 1, 3>, // " "
- Transform::template process_tile<0, 0, 1, 4>, // " "
- },
- {
- Transform::template process_tile<0, 0, 2, 0>, // Bottom
- Transform::template process_tile<0, 0, 2, 1>, // Bottom right
- Transform::template process_tile<0, 0, 2, 2>, // " "
- Transform::template process_tile<0, 0, 2, 3>, // " "
- Transform::template process_tile<0, 0, 2, 4>, // " "
- },
- {
- Transform::template process_tile<0, 0, 3, 0>, // Bottom
- Transform::template process_tile<0, 0, 3, 1>, // Bottom right
- Transform::template process_tile<0, 0, 3, 2>, // " "
- Transform::template process_tile<0, 0, 3, 3>, // " "
- Transform::template process_tile<0, 0, 3, 4>, // " "
- },
- {
- Transform::template process_tile<0, 0, 4, 0>, // Bottom
- Transform::template process_tile<0, 0, 4, 1>, // Bottom right
- Transform::template process_tile<0, 0, 4, 2>, // " "
- Transform::template process_tile<0, 0, 4, 3>, // " "
- Transform::template process_tile<0, 0, 4, 4>, // " "
- }
- },
- {
- {
- Transform::template process_tile<0, 1, 0, 0>, // Left
- Transform::template process_tile<0, 1, 0, 1>,
- Transform::template process_tile<0, 1, 0, 2>,
- Transform::template process_tile<0, 1, 0, 3>,
- Transform::template process_tile<0, 1, 0, 4>,
- },
- {
- Transform::template process_tile<0, 1, 1, 0>, // Bottom left
- Transform::template process_tile<0, 1, 1, 1>,
- Transform::template process_tile<0, 1, 1, 2>,
- Transform::template process_tile<0, 1, 1, 3>,
- Transform::template process_tile<0, 1, 1, 4>,
- },
- {
- Transform::template process_tile<0, 1, 2, 0>, // " "
- Transform::template process_tile<0, 1, 2, 1>,
- Transform::template process_tile<0, 1, 2, 2>,
- Transform::template process_tile<0, 1, 2, 3>,
- Transform::template process_tile<0, 1, 2, 4>,
- },
- {
- Transform::template process_tile<0, 1, 3, 0>, // " "
- Transform::template process_tile<0, 1, 3, 1>,
- Transform::template process_tile<0, 1, 3, 2>,
- Transform::template process_tile<0, 1, 3, 3>,
- Transform::template process_tile<0, 1, 3, 4>,
- },
- {
- Transform::template process_tile<0, 1, 4, 0>, // " "
- Transform::template process_tile<0, 1, 4, 1>,
- Transform::template process_tile<0, 1, 4, 2>,
- Transform::template process_tile<0, 1, 4, 3>,
- Transform::template process_tile<0, 1, 4, 4>,
- }
- }
- },
- {
- {
- {
- Transform::template process_tile<1, 0, 0, 0>, // Top
- Transform::template process_tile<1, 0, 0, 1>, // Top right
- Transform::template process_tile<1, 0, 0, 2>, // " "
- Transform::template process_tile<1, 0, 0, 3>, // " "
- Transform::template process_tile<1, 0, 0, 4>, // " "
- },
- {
- Transform::template process_tile<1, 0, 1, 0>,
- Transform::template process_tile<1, 0, 1, 1>,
- Transform::template process_tile<1, 0, 1, 2>,
- Transform::template process_tile<1, 0, 1, 3>,
- Transform::template process_tile<1, 0, 1, 4>,
- },
- {
- Transform::template process_tile<1, 0, 2, 0>,
- Transform::template process_tile<1, 0, 2, 1>,
- Transform::template process_tile<1, 0, 2, 2>,
- Transform::template process_tile<1, 0, 2, 3>,
- Transform::template process_tile<1, 0, 2, 4>,
- },
- {
- Transform::template process_tile<1, 0, 3, 0>,
- Transform::template process_tile<1, 0, 3, 1>,
- Transform::template process_tile<1, 0, 3, 2>,
- Transform::template process_tile<1, 0, 3, 3>,
- Transform::template process_tile<1, 0, 3, 4>,
- },
- {
- Transform::template process_tile<1, 0, 4, 0>,
- Transform::template process_tile<1, 0, 4, 1>,
- Transform::template process_tile<1, 0, 4, 2>,
- Transform::template process_tile<1, 0, 4, 3>,
- Transform::template process_tile<1, 0, 4, 4>,
- },
- },
- {
- {
- Transform::template process_tile<1, 1, 0, 0>, // Top left
- Transform::template process_tile<1, 1, 0, 1>,
- Transform::template process_tile<1, 1, 0, 2>,
- Transform::template process_tile<1, 1, 0, 3>,
- Transform::template process_tile<1, 1, 0, 4>,
- },
- {
- Transform::template process_tile<1, 1, 1, 0>,
- Transform::template process_tile<1, 1, 1, 1>,
- Transform::template process_tile<1, 1, 1, 2>,
- Transform::template process_tile<1, 1, 1, 3>,
- Transform::template process_tile<1, 1, 1, 4>,
- },
- {
- Transform::template process_tile<1, 1, 2, 0>,
- Transform::template process_tile<1, 1, 2, 1>,
- Transform::template process_tile<1, 1, 2, 2>,
- Transform::template process_tile<1, 1, 2, 3>,
- Transform::template process_tile<1, 1, 2, 4>,
- },
- {
- Transform::template process_tile<1, 1, 3, 0>,
- Transform::template process_tile<1, 1, 3, 1>,
- Transform::template process_tile<1, 1, 3, 2>,
- Transform::template process_tile<1, 1, 3, 3>,
- Transform::template process_tile<1, 1, 3, 4>,
- },
- {
- Transform::template process_tile<1, 1, 4, 0>,
- Transform::template process_tile<1, 1, 4, 1>,
- Transform::template process_tile<1, 1, 4, 2>,
- Transform::template process_tile<1, 1, 4, 3>,
- Transform::template process_tile<1, 1, 4, 4>,
- }
- }
- }
-};
-
-template struct WinogradGEMM<4, 4, 3, 3>::InputTransform<float>;
-} // namespace winograd
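As a small worked example of the per-column step ("Compute XT . x") in the transform above, applying the six rows of XT shown earlier to the column x = (1, 2, 3, 4, 5, 6)^T gives

    \begin{aligned}
    XTx_0 &= 4\cdot 1 - 5\cdot 3 + 5 = -6 \\
    XTx_1 &= -4\cdot 2 - 4\cdot 3 + 4 + 5 = -11 \\
    XTx_2 &= 4\cdot 2 - 4\cdot 3 - 4 + 5 = -3 \\
    XTx_3 &= -2\cdot 2 - 3 + 2\cdot 4 + 5 = 6 \\
    XTx_4 &= 2\cdot 2 - 3 - 2\cdot 4 + 5 = -2 \\
    XTx_5 &= 4\cdot 2 - 5\cdot 4 + 6 = -6
    \end{aligned}

The second pass (U = XTx . X) then applies the same coefficients along each row of the intermediate result.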
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
new file mode 100644
index 0000000..893122c
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace
+{
+
+template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
+void winograd_input_transform_6x6_fp32_process_tile(
+ int n_channels,
+ const float* const input_base,
+ const int input_row_stride,
+ const int input_col_stride,
+ float* const matrix_base,
+ const int matrix_stride,
+ const int _pad_top,
+ const int _pad_left,
+ const int _pad_bottom,
+ const int _pad_right
+)
+{
+ const int pad_top = Specialized ? PadTop : _pad_top;
+ const int pad_left = Specialized ? PadLeft : _pad_left;
+ const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+ const int pad_right = Specialized ? PadRight : _pad_right;
+
+ constexpr int inner_tile_rows = 6;
+ constexpr int inner_tile_cols = 6;
+
+ const int cells_i = inner_tile_rows - pad_bottom;
+ const int cells_j = inner_tile_cols - pad_right;
+
+ float *outptr = matrix_base;
+
+ // Get pointers into the input tile
+ const float *x_ptrs[inner_tile_rows][inner_tile_cols];
+ for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
+ {
+ // Get a pointer into the row
+ const float* const row_ptr = input_base + xi*input_row_stride;
+
+ for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
+ {
+ x_ptrs[i][j] = row_ptr + xj*input_col_stride;
+ }
+ }
+
+ // Matrices used/computed in this kernel.
+ float x[inner_tile_rows][inner_tile_cols];
+ float XTx[inner_tile_rows][inner_tile_cols];
+ float U[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = XTx[i][j] = 0.0f;
+ }
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used/computed in this kernel
+ float32x4_t x[inner_tile_rows][inner_tile_cols];
+ float32x4_t XTx[inner_tile_rows][inner_tile_cols];
+ float32x4_t U[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vdupq_n_f32(0.0f);
+ XTx[i][j] = vdupq_n_f32(0.0f);
+ }
+ }
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = pad_top; i < cells_i; i++)
+ {
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[i][j] = vld1q_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 4;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
+
+ // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f);
+
+ // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f);
+
+ // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
+
+ // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f);
+
+ // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f);
+
+ // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used/computed in this kernel
+ float32x2_t x[inner_tile_rows][inner_tile_cols];
+ float32x2_t XTx[inner_tile_rows][inner_tile_cols];
+ float32x2_t U[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ x[i][j] = vdup_n_f32(0.0f);
+ XTx[i][j] = vdup_n_f32(0.0f);
+ }
+ }
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = pad_top; i < cells_i; i++)
+ {
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[i][j] = vld1_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 2;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
+
+ // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
+
+ // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
+
+ // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
+
+ // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
+
+ // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
+
+ // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int i = pad_top; i < cells_i; i++)
+ {
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[i][j] = *(x_ptrs[i][j]++);
+ }
+ }
+
+ // Compute XT . x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_rows; i++)
+ {
+ for (int j = 0; j < inner_tile_cols; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = U[i][j];
+ }
+ }
+ outptr++;
+ }
+}
+} // namespace (anonymous)
+
+namespace winograd
+{
+template <int k>
+using Tiles = InputTransformImplTiles<k, k, 6, 6, float>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_top_padded[n_pad_top] = {
+ winograd_input_transform_6x6_fp32_process_tile<true, 1, 0, 0, 0>,
+};
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = {
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 1, 0, 0>,
+};
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_bottom_padded[n_pad_bottom] = {
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>,
+};
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = {
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>;
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>;
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_top_padded[n_pad_top] = {
+ winograd_input_transform_6x6_fp32_process_tile<true, 2, 0, 0, 0>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = {
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 2, 0, 0>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_bottom_padded[n_pad_bottom] = {
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = {
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>,
+ winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>,
+};
+
+template class InputTransform<3, 3, 6, 6, float>;
+template class InputTransform<5, 5, 6, 6, float>;
+} // namespace winograd
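The `Specialized ? PadX : _pad_x` pattern used in the tile function above lets one function body serve both as a set of fully specialised kernels (padding fixed by the template arguments, so the loop bounds are compile-time constants) and as a single generic fallback that reads the padding from its trailing run-time arguments. A minimal, self-contained sketch of the idea, independent of the arm_compute types:

    #include <cstdio>

    template <bool Specialized, int PadRight = 0>
    void process(int _pad_right)
    {
      // With Specialized == true the padding is a compile-time constant and the
      // loop bound folds to a constant the compiler can unroll against; with
      // Specialized == false it is taken from the run-time argument instead.
      const int pad_right = Specialized ? PadRight : _pad_right;
      const int cells_j = 4 - pad_right;
      for (int j = 0; j < cells_j; j++)
      {
        std::printf("column %d\n", j);
      }
    }

    int main()
    {
      process<true, 1>(0); // specialised: pad_right is 1 regardless of the argument
      process<false>(3);   // generic: pad_right comes from the argument
      return 0;
    }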
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
new file mode 100644
index 0000000..ea842a4
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace
+{
+
+template <bool Specialized, int PadRight=0>
+void winograd_output_transform_2_7_fp32_process_tile(
+ const int n_channels,
+ const float* const matrix_base,
+ const int matrix_stride,
+ const float* const biases,
+ float* const output,
+ const int output_row_stride,
+ const int output_col_stride,
+ const int _pad_bottom,
+ const int _pad_right
+)
+{
+ (void) output_row_stride;
+ (void) _pad_bottom;
+ constexpr int output_tile_cols = 2;
+ constexpr int inner_tile_cols = 8;
+
+ const int pad_right = Specialized ? PadRight : _pad_right;
+ const int cells_j = output_tile_cols - pad_right;
+
+ // Construct a map to the output cells
+ float *outptrs[cells_j];
+ for (int j = 0; j < cells_j; j++)
+ {
+ outptrs[j] = output + j*output_col_stride;
+ }
+ const float *inptr = matrix_base;
+ const float *bptr = biases;
+
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __arm_any__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1q_f32(inptr + j*matrix_stride);
+ }
+ inptr += 4;
+
+ f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != 0)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[j], f[j] + b);
+ outptrs[j] += 4;
+ }
+ }
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1_f32(inptr + j*matrix_stride);
+ }
+ inptr += 2;
+
+ f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != 0)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[j], f[j] + b);
+ outptrs[j] += 2;
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
+
+ // Read a 1x8 tile in the Winograd domain
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ F[j] = *(inptr + j*matrix_stride);
+ }
+ inptr++;
+
+ f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
+ f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1 + F[7]*1;
+
+ // Write out the output tile
+ if (bptr != 0)
+ {
+ b = *(bptr++);
+ }
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[j]++) = f[j] + b;
+ }
+ }
+}
+} // namespace (anonymous)
+
+namespace winograd
+{
+using Tiles = OutputTransformImplTiles<1, 7, 1, 8, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2_7_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+ winograd_output_transform_2_7_fp32_process_tile<true, 1>
+};
+
+template class OutputTransform<1, 7, 1, 8, float>;
+template class OutputTransform<7, 1, 8, 1, float>;
+} // namespace winograd
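Reading the coefficients off the scalar tail loop above, this output transform maps the eight Winograd-domain values F[0..7] to the two outputs as f = AT . F (with the optional bias b added to both outputs), where

    AT = \begin{bmatrix}
           1 &  1 & 1 &  1 & 1 &  1 & 1 & 0 \\
           0 & -1 & 1 & -2 & 2 & -3 & 3 & 1
         \end{bmatrix}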
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
index 3b3cda0..597b074 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
@@ -23,59 +23,34 @@
*/
#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-namespace winograd
+namespace
{
-using Transform = WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
- // NOTE: Cost in FLOPs rather than instructions or uops.
- const int tile_M = iceildiv(shape.n_rows, 2);
- const int tile_N = iceildiv(shape.n_cols, 2);
- return 24 * tile_M * tile_N * shape.n_channels;
-}
-
-/* F(2x2, 3x3) constructs 2x2 output tiles from a 3x3 convolution. Since we use
- * enough tiles to cover the output space each output tile may contain 0 or 1
- * padded values to the right and bottom columns or rows of the tile, e.g.:
- *
- * ___ ___
- * | | | X|
- * |___| |__X|
- *
- * ___ ___
- * | | | X|
- * |X_X| |X_X|
- *
- *
- * We provide a specialised output transform for each of these instances.
- * Consequently we below construct an array of the various padding options, the
- * array contains pointers to the specific implementations.
- */
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadBottom=0, int PadRight=0>
+void winograd_output_transform_2x2_3x3_fp32_process_tile(
const int n_channels,
const float* const matrix_base,
const int matrix_stride,
const float* const biases,
float* const output,
const int output_row_stride,
- const int output_col_stride
+ const int output_col_stride,
+ const int _pad_bottom,
+ const int _pad_right
)
{
- constexpr int cells_i = 2 - pad_bottom;
- constexpr int cells_j = 2 - pad_right;
+ constexpr int OutputTileRows = 2, OutputTileCols = 2;
+ const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+ const int pad_right = Specialized ? PadRight : _pad_right;
+
+ const int cells_i = OutputTileRows - pad_bottom;
+ const int cells_j = OutputTileCols - pad_right;
// Construct a map to the output cells
- float *outptrs[cells_i][cells_j];
+ float *outptrs[OutputTileRows][OutputTileCols];
for (int i = 0; i < cells_i; i++)
{
for (int j = 0; j < cells_j; j++)
@@ -373,19 +348,28 @@
}
}
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+} // namespace (anonymous)
+
+namespace winograd
{
- {
- Transform::template process_tile<0, 0>, // No padding
- Transform::template process_tile<0, 1>, // Right padding
- },
- {
- Transform::template process_tile<1, 0>, // Bottom padding
- Transform::template process_tile<1, 1>, // Bottom and right padding
- }
+using Tiles = OutputTransformImplTiles<3, 3, 4, 4, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_3x3_fp32_process_tile<false>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_3x3_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+ winograd_output_transform_2x2_3x3_fp32_process_tile<true, 1, 0>
};
-template struct WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+ winograd_output_transform_2x2_3x3_fp32_process_tile<true, 0, 1>
+};
+
+template class OutputTransform<3, 3, 4, 4, float>;
} // namespace winograd
+
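The tilefn_unpadded / tilefn_bottom_padded / tilefn_right_padded / tilefn_generic members populated above are the hooks a tile dispatcher chooses between. The selection logic itself lives in OutputTransformImplTiles and is not shown in this diff; the sketch below is only a hypothetical illustration of the idea, with made-up function names:

    #include <cstdio>

    // Hypothetical stand-ins for the specialised and generic tile kernels.
    using TileFn = void (*)(int pad_bottom, int pad_right);

    void tile_unpadded(int, int)                 { std::puts("specialised: no padding"); }
    void tile_bottom_padded(int pad_bottom, int) { std::printf("specialised: bottom pad %d\n", pad_bottom); }
    void tile_right_padded(int, int pad_right)   { std::printf("specialised: right pad %d\n", pad_right); }
    void tile_generic(int pad_bottom, int pad_right)
    {
      std::printf("generic: bottom %d, right %d\n", pad_bottom, pad_right);
    }

    // Pick a specialised kernel when at most one edge needs padding; otherwise
    // fall back to the generic kernel, which reads the padding at run time.
    TileFn select_tile_fn(int pad_bottom, int pad_right)
    {
      if (pad_bottom == 0 && pad_right == 0) return tile_unpadded;
      if (pad_right == 0)                    return tile_bottom_padded;
      if (pad_bottom == 0)                   return tile_right_padded;
      return tile_generic;
    }

    int main()
    {
      select_tile_fn(0, 0)(0, 0);
      select_tile_fn(1, 0)(1, 0);
      select_tile_fn(0, 1)(0, 1);
      select_tile_fn(1, 1)(1, 1);
      return 0;
    }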
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
index 8668535..60d7181 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
@@ -23,57 +23,34 @@
*/
#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-namespace winograd
+namespace
{
-using Transform = WinogradGEMM<2, 2, 5, 5>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
- (void) shape;
- return 0;
-}
-
-/* F(2x2, 5x5) constructs 2x2 output tiles from a 5x5 convolution. Since we use
- * enough tiles to cover the output space each output tile may contain 0 or 1
- * padded values to the right and bottom columns or rows of the tile, e.g.:
- *
- * ___ ___
- * | | | X|
- * |___| |__X|
- *
- * ___ ___
- * | | | X|
- * |X_X| |X_X|
- *
- *
- * We provide a specialised output transform for each of these instances.
- * Consequently we below construct an array of the various padding options, the
- * array contains pointers to the specific implementations.
- */
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadBottom=0, int PadRight=0>
+void winograd_output_transform_2x2_5x5_fp32_process_tile(
const int n_channels,
const float* const matrix_base,
const int matrix_stride,
const float* const biases,
float* const output,
const int output_row_stride,
- const int output_col_stride
+ const int output_col_stride,
+ const int _pad_bottom,
+ const int _pad_right
)
{
- constexpr int cells_i = 2 - pad_bottom;
- constexpr int cells_j = 2 - pad_right;
+ constexpr int OutputTileRows = 2, OutputTileCols = 2;
+ const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+ const int pad_right = Specialized ? PadRight : _pad_right;
+
+ const int cells_i = OutputTileRows - pad_bottom;
+ const int cells_j = OutputTileCols - pad_right;
// Construct a map to the output cells
- float *outptrs[cells_i][cells_j];
+ float *outptrs[OutputTileRows][OutputTileCols];
for (int i = 0; i < cells_i; i++)
{
for (int j = 0; j < cells_j; j++)
@@ -365,19 +342,28 @@
}
}
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+} // namespace (anonymous)
+
+namespace winograd
{
- {
- Transform::template process_tile<0, 0>, // No padding
- Transform::template process_tile<0, 1>, // Right padding
- },
- {
- Transform::template process_tile<1, 0>, // Bottom padding
- Transform::template process_tile<1, 1>, // Bottom and right padding
- }
+using Tiles = OutputTransformImplTiles<5, 5, 6, 6, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_5x5_fp32_process_tile<false>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_5x5_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+ winograd_output_transform_2x2_5x5_fp32_process_tile<true, 1, 0>
};
-template struct WinogradGEMM<2, 2, 5, 5>::OutputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+ winograd_output_transform_2x2_5x5_fp32_process_tile<true, 0, 1>
+};
+
+template class OutputTransform<5, 5, 6, 6, float>;
} // namespace winograd
+
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
new file mode 100644
index 0000000..911759b
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace
+{
+
+template <bool Specialized, int PadRight=0>
+void winograd_output_transform_4_5_fp32_process_tile(
+ const int n_channels,
+ const float* const matrix_base,
+ const int matrix_stride,
+ const float* const biases,
+ float* const output,
+ const int output_row_stride,
+ const int output_col_stride,
+ const int _pad_bottom,
+ const int _pad_right
+)
+{
+ (void) output_row_stride;
+ (void) _pad_bottom;
+ constexpr int output_tile_cols = 4;
+ constexpr int inner_tile_cols = 8;
+
+ const int pad_right = Specialized ? PadRight : _pad_right;
+ const int cells_j = output_tile_cols - pad_right;
+
+ // Construct a map to the output cells
+ float *outptrs[cells_j];
+ for (int j = 0; j < cells_j; j++)
+ {
+ outptrs[j] = output + j*output_col_stride;
+ }
+ const float *inptr = matrix_base;
+ const float *bptr = biases;
+
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __arm_any__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1q_f32(inptr + j*matrix_stride);
+ }
+ inptr += 4;
+
+ f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+ f[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+ f[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != 0)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[j], f[j] + b);
+ outptrs[j] += 4;
+ }
+ }
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1_f32(inptr + j*matrix_stride);
+ }
+ inptr += 2;
+
+ f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+ f[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+ f[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != 0)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[j], f[j] + b);
+ outptrs[j] += 2;
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
+
+ // Read a 1x8 tile in the Winograd domain
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ F[j] = *(inptr + j*matrix_stride);
+ }
+ inptr++;
+
+ f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
+ f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1;
+ f[2] = F[3]*4 + F[4]*4 + F[5]*9 + F[6]*9 + F[1]*1 + F[2]*1;
+ f[3] = F[1]*-1 + F[5]*-27 + F[3]*-8 + F[4]*8 + F[6]*27 + F[2]*1 + F[7]*1;
+
+ // Write out the output tile
+ if (bptr != 0)
+ {
+ b = *(bptr++);
+ }
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[j]++) = f[j] + b;
+ }
+ }
+}
+
+} // namespace (anonymous)
+
+namespace winograd
+{
+using Tiles = OutputTransformImplTiles<1, 5, 1, 8, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4_5_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+ winograd_output_transform_4_5_fp32_process_tile<true, 1>,
+ winograd_output_transform_4_5_fp32_process_tile<true, 2>,
+ winograd_output_transform_4_5_fp32_process_tile<true, 3>
+};
+
+template class OutputTransform<1, 5, 1, 8, float>;
+template class OutputTransform<5, 1, 8, 1, float>;
+} // namespace winograd
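Reading the coefficients off the scalar tail loop above, the output transform for this 1x8 inner tile and 1x4 output tile is f = AT . F with

    AT = \begin{bmatrix}
           1 &  1 & 1 &  1 & 1 &   1 &  1 & 0 \\
           0 & -1 & 1 & -2 & 2 &  -3 &  3 & 0 \\
           0 &  1 & 1 &  4 & 4 &   9 &  9 & 0 \\
           0 & -1 & 1 & -8 & 8 & -27 & 27 & 1
         \end{bmatrix}

which is the same matrix the vectorised paths evaluate via the chained vmla/vmul calls.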
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
index cd3bdef..15cc04b 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
@@ -23,73 +23,34 @@
*/
#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-namespace winograd
+namespace
{
-using Transform = WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
- // NOTE: Cost in FLOPs rather than instructions or uops.
- const int tile_M = iceildiv(shape.n_rows, 4);
- const int tile_N = iceildiv(shape.n_cols, 4);
- return 170 * tile_M * tile_N * shape.n_channels;
-}
-
-/* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. Since we use
- * enough tiles to cover the output space each output tile may contain up to 3
- * padded values to the right and bottom columns or rows of the tile, e.g.:
-*
-* ________ ________ ________ ________
-* | | | X| | X X| | X X X|
-* | | | X| | X X| | X X X|
-* | | | X| | X X| | X X X|
-* |_______| |______X| |____X_X| |__X_X_X|
-*
-* ________ ________ ________ ________
-* | | | X| | X X| | X X X|
-* | | | X| | X X| | X X X|
-* | | | X| | X X| | X X X|
-* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
-*
-* ________ ________ ________ ________
-* | | | X| | X X| | X X X|
-* | | | X| | X X| | X X X|
-* |X X X X| |X X X X| |X X X X| |X X X X|
-* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
-*
-* ________ ________ ________ ________
-* | | | X| | X X| | X X X|
-* |X X X X| |X X X X| |X X X X| |X X X X|
-* |X X X X| |X X X X| |X X X X| |X X X X|
-* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
-*
-*
-* We provide a specialised output transform for each of these instances.
-*/
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadBottom=0, int PadRight=0>
+void winograd_output_transform_4x4_3x3_fp32_process_tile(
const int n_channels,
const float* const matrix_base,
const int matrix_stride,
const float* const biases,
float* const output,
const int output_row_stride,
- const int output_col_stride
+ const int output_col_stride,
+ const int _pad_bottom,
+ const int _pad_right
)
{
- constexpr int cells_i = 4 - pad_bottom;
- constexpr int cells_j = 4 - pad_right;
+ const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+ const int pad_right = Specialized ? PadRight : _pad_right;
+ constexpr int TileRows = 4, TileCols = 4;
+
+ const int cells_i = TileRows - pad_bottom;
+ const int cells_j = TileCols - pad_right;
// Construct a map to the output cells
- float *outptrs[cells_i][cells_j];
+ float *outptrs[TileRows][TileCols];
for (int i = 0; i < cells_i; i++)
{
for (int j = 0; j < cells_j; j++)
@@ -437,35 +398,31 @@
}
}
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+} // namespace (anonymous)
+
+namespace winograd
{
- {
- Transform::template process_tile<0, 0>,
- Transform::template process_tile<0, 1>,
- Transform::template process_tile<0, 2>,
- Transform::template process_tile<0, 3>,
- },
- {
- Transform::template process_tile<1, 0>,
- Transform::template process_tile<1, 1>,
- Transform::template process_tile<1, 2>,
- Transform::template process_tile<1, 3>,
- },
- {
- Transform::template process_tile<2, 0>,
- Transform::template process_tile<2, 1>,
- Transform::template process_tile<2, 2>,
- Transform::template process_tile<2, 3>,
- },
- {
- Transform::template process_tile<3, 0>,
- Transform::template process_tile<3, 1>,
- Transform::template process_tile<3, 2>,
- Transform::template process_tile<3, 3>,
- }
+using Tiles = OutputTransformImplTiles<3, 3, 6, 6, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_4x4_3x3_fp32_process_tile<false>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4x4_3x3_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+ winograd_output_transform_4x4_3x3_fp32_process_tile<true, 1, 0>,
+ winograd_output_transform_4x4_3x3_fp32_process_tile<true, 2, 0>,
+ winograd_output_transform_4x4_3x3_fp32_process_tile<true, 3, 0>,
};
-template struct WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+ winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 1>,
+ winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 2>,
+ winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 3>,
+};
+
+template class OutputTransform<3, 3, 6, 6, float>;
} // namespace winograd
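The compile-time tile_fns[pad_bottom][pad_right] table is replaced by four smaller families: an unpadded fast path, one-sided bottom- and right-padded specialisations, and a generic fallback that reads the padding at run time. A hypothetical dispatcher over the members registered above might look as follows; this is a sketch of the expected selection logic, not the actual OutputTransformImplTiles implementation:

    // Illustrative selection between the tile functions registered above.
    template <typename Tiles>
    typename Tiles::TileFn select_tile_fn(int pad_bottom, int pad_right)
    {
        if (pad_bottom == 0 && pad_right == 0)
        {
            return Tiles::tilefn_unpadded;                      // Fully specialised fast path
        }
        if (pad_right == 0)
        {
            return Tiles::tilefn_bottom_padded[pad_bottom - 1]; // Bottom padding only
        }
        if (pad_bottom == 0)
        {
            return Tiles::tilefn_right_padded[pad_right - 1];   // Right padding only
        }
        return Tiles::tilefn_generic;                           // Both sides padded
    }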
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
new file mode 100644
index 0000000..58bed71
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace
+{
+
+template <bool Specialized, int PadRight=0>
+void winograd_output_transform_6_3_fp32_process_tile(
+ const int n_channels,
+ const float* const matrix_base,
+ const int matrix_stride,
+ const float* const biases,
+ float* const output,
+ const int output_row_stride,
+ const int output_col_stride,
+ const int _pad_bottom,
+ const int _pad_right
+)
+{
+ (void) output_row_stride;
+ (void) _pad_bottom;
+ constexpr int output_tile_cols = 6;
+ constexpr int inner_tile_cols = 8;
+
+ const int pad_right = Specialized ? PadRight : _pad_right;
+ const int cells_j = output_tile_cols - pad_right;
+
+ // Construct a map to the output cells
+ float *outptrs[cells_j];
+ for (int j = 0; j < cells_j; j++)
+ {
+ outptrs[j] = output + j*output_col_stride;
+ }
+ const float *inptr = matrix_base;
+ const float *bptr = biases;
+
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __arm_any__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1q_f32(inptr + j*matrix_stride);
+ }
+ inptr += 4;
+
+ f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+ f[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+ f[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+ f[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 81), F[5], 81), F[4], 16), F[3], 16);
+ f[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 243), F[4], 32), F[3], -32), F[5], -243), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != 0)
+ {
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ }
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[j], f[j] + b);
+ outptrs[j] += 4;
+ }
+ }
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
+
+ // Read a 1x8 tile in the Winograd domain
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ F[j] = vld1_f32(inptr + j*matrix_stride);
+ }
+ inptr += 2;
+
+ f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+ f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+ f[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+ f[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+ f[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 81), F[5], 81), F[4], 16), F[3], 16);
+ f[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 243), F[4], 32), F[3], -32), F[5], -243), F[1], -1);
+
+ // Write out the output tile
+ if (bptr != 0)
+ {
+ b = vld1_f32(bptr);
+ bptr += 2;
+ }
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[j], f[j] + b);
+ outptrs[j] += 2;
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
+
+ // Read a 1x8 tile in the Winograd domain
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ F[j] = *(inptr + j*matrix_stride);
+ }
+ inptr++;
+
+ f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
+ f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1;
+ f[2] = F[3]*4 + F[4]*4 + F[5]*9 + F[6]*9 + F[1]*1 + F[2]*1;
+ f[3] = F[1]*-1 + F[5]*-27 + F[3]*-8 + F[4]*8 + F[6]*27 + F[2]*1;
+ f[4] = F[3]*16 + F[4]*16 + F[5]*81 + F[6]*81 + F[1]*1 + F[2]*1;
+ f[5] = F[1]*-1 + F[5]*-243 + F[3]*-32 + F[4]*32 + F[6]*243 + F[2]*1 + F[7]*1;
+
+ // Write out the output tile
+ if (bptr != 0)
+ {
+ b = *(bptr++);
+ }
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[j]++) = f[j] + b;
+ }
+ }
+}
+
+} // namespace (anonymous)
+
+namespace winograd
+{
+using Tiles = OutputTransformImplTiles<1, 3, 1, 8, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_6_3_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+ winograd_output_transform_6_3_fp32_process_tile<true, 1>,
+ winograd_output_transform_6_3_fp32_process_tile<true, 2>,
+ winograd_output_transform_6_3_fp32_process_tile<true, 3>,
+ winograd_output_transform_6_3_fp32_process_tile<true, 4>,
+ winograd_output_transform_6_3_fp32_process_tile<true, 5>,
+};
+
+template class OutputTransform<1, 3, 1, 8, float>;
+template class OutputTransform<3, 1, 8, 1, float>;
+} // namespace winograd
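Reading the coefficients of the unrolled scalar expressions above as a matrix, the 1x8 -> 1x6 transform is f = A^T F with

\[
A^T = \begin{pmatrix}
1 & 1 & 1 & 1 & 1 & 1 & 1 & 0 \\
0 & -1 & 1 & -2 & 2 & -3 & 3 & 0 \\
0 & 1 & 1 & 4 & 4 & 9 & 9 & 0 \\
0 & -1 & 1 & -8 & 8 & -27 & 27 & 0 \\
0 & 1 & 1 & 16 & 16 & 81 & 81 & 0 \\
0 & -1 & 1 & -32 & 32 & -243 & 243 & 1
\end{pmatrix}
\]

Interpreting the columns as samples at 0, -1, 1, -2, 2, -3, 3 plus an extra sample that feeds only the last row is an inference from these coefficients, not something stated in the patch.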
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp
new file mode 100644
index 0000000..85cf418
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
+
+namespace winograd
+{
+ template <>
+ template <>
+ void WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+ )
+ {
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const float *inptrs[kernel_cols];
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ inptrs[j] = input + j*weight_col_stride;
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[kernel_cols], V[inner_tile_cols];
+
+ // Read weights
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[j] = *(inptrs[j]++);
+ }
+
+ // Compute V = w WT
+ V[0] = (w[0]*-1) / 36.0f;
+ V[1] = (w[1]*-1 + w[3]*-1 + w[5]*-1 + w[0]*1 + w[2]*1 + w[4]*1 + w[6]*1) / 48.0f;
+ V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1 + w[5]*1 + w[6]*1) / 48.0f;
+ V[3] = (w[0]*-1 + w[6]*-64 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8 + w[5]*32) / 120.0f;
+ V[4] = (w[0]*-1 + w[6]*-64 + w[5]*-32 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120.0f;
+ V[5] = (w[5]*-243 + w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[6]*729 + w[0]*1) / 720.0f;
+ V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[5]*243 + w[6]*729 + w[0]*1) / 720.0f;
+ V[7] = (w[6]*1) / 1.0f;
+
+ // Store the transformed weights
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ *(outptr + j*matrix_stride) = V[j];
+ }
+ outptr++;
+ }
+ }
+ }
+
+ template <>
+ template <>
+ int WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+ {
+ (void) shape;
+ return 0; // TODO
+ }
+
+ template <>
+ template <>
+ void WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+ )
+ {
+ // Redirect to the 1xN implementation
+ WinogradGEMM<1, 2, 1, 7>::template WeightsTransform<float>::execute(
+ n_output_channels, n_input_channels, input, output, matrix_stride,
+ matrix_row_stride
+ );
+ }
+
+ template <>
+ template <>
+ int WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+ {
+ (void) shape;
+ return 0; // TODO
+ }
+
+ template struct WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>;
+ template struct WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>;
+}
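The kernel-transform rows above are, up to a per-row sign and scale, evaluations of the 7-tap filter polynomial at the same points used by the corresponding output transform. A scalar cross-check under that reading — the sign and scale tables are read off the code, the helper itself is hypothetical:

    // Hypothetical reference for the F(2, 7) kernel transform above:
    // V[j] = sign[j] * (sum_k w[k] * p_j^k) / scale[j] for j = 0..6, and V[7] = w[6].
    #include <cmath>

    void reference_weight_transform_2_7(const float (&w)[7], float (&V)[8])
    {
        const float points[7] = { 0.f, -1.f, 1.f, -2.f, 2.f, -3.f, 3.f };
        const float signs[7]  = { -1.f, 1.f, 1.f, -1.f, -1.f, 1.f, 1.f };
        const float scales[7] = { 36.f, 48.f, 48.f, 120.f, 120.f, 720.f, 720.f };
        for (int j = 0; j < 7; j++)
        {
            float acc = 0.f;
            for (int k = 0; k < 7; k++)
            {
                acc += w[k] * std::pow(points[j], static_cast<float>(k));
            }
            V[j] = signs[j] * acc / scales[j];
        }
        V[7] = w[6]; // "Infinity" row: the last tap passes through unscaled
    }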
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
index 76393c1..2f4f6e1 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
@@ -401,7 +401,7 @@
template <>
int WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
{
- return 0;
+ return 0; // TODO
}
template class WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>;
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp
new file mode 100644
index 0000000..2f14e20
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
+
+namespace winograd
+{
+ template <>
+ template <>
+ void WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+ )
+ {
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const float *inptrs[kernel_cols];
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ inptrs[j] = input + j*weight_col_stride;
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[kernel_cols], V[inner_tile_cols];
+
+ // Read weights
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[j] = *(inptrs[j]++);
+ }
+
+ // Compute V = w WT
+ V[0] = (w[0]*-1) / 36;
+ V[1] = (w[1]*-1 + w[3]*-1 + w[0]*1 + w[2]*1 + w[4]*1) / 48;
+ V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1) / 48;
+ V[3] = (w[0]*-1 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8) / 120;
+ V[4] = (w[0]*-1 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120;
+ V[5] = (w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[0]*1) / 720;
+ V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[0]*1) / 720;
+ V[7] = (w[4]*1) / 1;
+
+ // Store the transformed weights
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ *(outptr + j*matrix_stride) = V[j];
+ }
+ outptr++;
+ }
+ }
+ }
+
+ template <>
+ template <>
+ int WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+ {
+ (void) shape;
+ return 0; // TODO
+ }
+
+ template <>
+ template <>
+ void WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+ )
+ {
+ // Redirect to the 1xN implementation
+ WinogradGEMM<1, 4, 1, 5>::template WeightsTransform<float>::execute(
+ n_output_channels, n_input_channels, input, output, matrix_stride,
+ matrix_row_stride
+ );
+ }
+
+ template <>
+ template <>
+ int WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+ {
+ (void) shape;
+ return 0; // TODO
+ }
+
+ template struct WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>;
+ template struct WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>;
+}
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp
new file mode 100644
index 0000000..c560aa8
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
+
+
+namespace winograd
+{
+ template <>
+ template <>
+ void WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+ )
+ {
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const float *inptrs[3];
+ for (int j = 0; j < 3; j++)
+ {
+ inptrs[j] = input + j*weight_col_stride;
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[3], V[inner_tile_cols];
+
+ // Read weights
+ for (int j = 0; j < 3; j++)
+ {
+ w[j] = *(inptrs[j]++);
+ }
+
+ // Compute V = w WT
+ V[0] = (w[0]*-1) / 36.0f;
+ V[1] = (w[1]*-1 + w[0]*1 + w[2]*1) / 48.0f;
+ V[2] = (w[0]*1 + w[1]*1 + w[2]*1) / 48.0f;
+ V[3] = (w[0]*-1 + w[2]*-4 + w[1]*2) / 120.0f;
+ V[4] = (w[0]*-1 + w[2]*-4 + w[1]*-2) / 120.0f;
+ V[5] = (w[1]*-3 + w[2]*9 + w[0]*1) / 720.0f;
+ V[6] = (w[1]*3 + w[2]*9 + w[0]*1) / 720.0f;
+ V[7] = (w[2]*1) / 1;
+
+ // Store the transformed weights
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ *(outptr + j*matrix_stride) = V[j];
+ }
+ outptr++;
+ }
+ }
+ }
+
+ template <>
+ template <>
+ int WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+ {
+ (void) shape;
+ return 0; // TODO
+ }
+
+ template <>
+ template <>
+ void WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+ )
+ {
+ // Redirect to the 1xN implementation
+ WinogradGEMM<1, 6, 1, 3>::template WeightsTransform<float>::execute(
+ n_output_channels, n_input_channels, input, output, matrix_stride,
+ matrix_row_stride
+ );
+ }
+
+ template <>
+ template <>
+ int WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+ {
+ (void) shape;
+ return 0; // TODO
+ }
+
+ template struct WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>;
+ template struct WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>;
+}
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
index a5d4302..a7de2fd 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
@@ -225,4 +225,16 @@
template class WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
template class WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;
+template class WinogradGEMM<1, 6, 1, 3>::Convolution<float, float>;
+template class WinogradGEMM<6, 1, 3, 1>::Convolution<float, float>;
+
template class WinogradGEMM<2, 2, 5, 5>::Convolution<float, float>;
+
+template class WinogradGEMM<1, 4, 1, 5>::Convolution<float, float>;
+template class WinogradGEMM<4, 1, 5, 1>::Convolution<float, float>;
+
+template class WinogradGEMM<1, 2, 1, 7>::Convolution<float, float>;
+template class WinogradGEMM<2, 1, 7, 1>::Convolution<float, float>;
+
+
+
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 11bdbda..39dad8f 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -34,7 +34,7 @@
#include <string>
using namespace arm_compute;
-
+#ifndef DOXYGEN_SKIP_THIS
std::string arm_compute::build_information()
{
static const std::string information =
@@ -42,7 +42,7 @@
;
return information;
}
-
+#endif /* DOXYGEN_SKIP_THIS */
std::string arm_compute::read_file(const std::string &filename, bool binary)
{
std::string out;
@@ -252,6 +252,68 @@
return pool_type_map[type];
}
+const std::string &arm_compute::string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage)
+{
+ static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map =
+ {
+ { GEMMLowpOutputStageType::NONE, "" },
+ { GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down" },
+ { GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint" },
+ { GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float" }
+ };
+
+ return output_stage_map[output_stage];
+}
+
+std::string arm_compute::string_from_pixel_value(const PixelValue &value, const DataType data_type)
+{
+ std::stringstream ss;
+ std::string converted_string;
+
+ switch(data_type)
+ {
+ case DataType::U8:
+ case DataType::QASYMM8:
+ // Needs conversion to 32 bit, otherwise interpreted as ASCII values
+ ss << uint32_t(value.get<uint8_t>());
+ converted_string = ss.str();
+ break;
+ case DataType::S8:
+ // Needs conversion to 32 bit, otherwise interpreted as ASCII values
+ ss << int32_t(value.get<int8_t>());
+ converted_string = ss.str();
+ break;
+ case DataType::U16:
+ ss << value.get<uint16_t>();
+ converted_string = ss.str();
+ break;
+ case DataType::S16:
+ ss << value.get<int16_t>();
+ converted_string = ss.str();
+ break;
+ case DataType::U32:
+ ss << value.get<uint32_t>();
+ converted_string = ss.str();
+ break;
+ case DataType::S32:
+ ss << value.get<int32_t>();
+ converted_string = ss.str();
+ break;
+ case DataType::F32:
+ converted_string = float_to_string_with_full_precision(value.get<float>());
+ break;
+ case DataType::F16:
+ static_assert(sizeof(half) == 2, "Half must be 16 bit");
+ ss << value.get<half>();
+ converted_string = ss.str();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not handled");
+ }
+
+ return converted_string;
+}
+
std::string arm_compute::lower_string(const std::string &val)
{
std::string res = val;
@@ -274,28 +336,16 @@
return PadStrideInfo(strides.first, strides.second, same_pad_left, same_pad_right, same_pad_top, same_pad_bottom, DimensionRoundingType::CEIL);
}
-TensorShape arm_compute::deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, TensorShape input, TensorShape weights)
-{
- TensorShape out_shape(input);
- out_shape.set(0, out_dims.first);
- out_shape.set(1, out_dims.second);
- out_shape.set(2, weights[3]);
- return out_shape;
-}
-
const std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions(
unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, unsigned int padx, unsigned int pady,
- unsigned int inner_border_right, unsigned int inner_border_top, unsigned int stride_x, unsigned int stride_y)
+ unsigned int stride_x, unsigned int stride_y)
{
ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1);
- ARM_COMPUTE_ERROR_ON(((in_width - 1) * stride_x + kernel_width + inner_border_right) < 2 * padx);
- ARM_COMPUTE_ERROR_ON(((in_height - 1) * stride_y + kernel_height + inner_border_top) < 2 * pady);
- const int padx_deconv = (kernel_width - padx - 1);
- const int pady_deconv = (kernel_height - pady - 1);
- ARM_COMPUTE_ERROR_ON(padx_deconv < 0);
- ARM_COMPUTE_ERROR_ON(pady_deconv < 0);
- const int w = stride_x * (in_width - 1) + kernel_width + inner_border_right - 2 * padx_deconv;
- const int h = stride_y * (in_height - 1) + kernel_height + inner_border_top - 2 * pady_deconv;
+ ARM_COMPUTE_ERROR_ON(((in_width - 1) * stride_x + kernel_width) < 2 * padx);
+ ARM_COMPUTE_ERROR_ON(((in_height - 1) * stride_y + kernel_height) < 2 * pady);
+ const int w = stride_x * (in_width - 1) + kernel_width - 2 * padx;
+ const int h = stride_y * (in_height - 1) + kernel_height - 2 * pady;
+
return std::make_pair<unsigned int, unsigned int>(w, h);
}
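With the inner-border parameters gone, the output size reduces to the usual transposed-convolution formula w = stride_x * (in_width - 1) + kernel_width - 2 * padx (and likewise for h). As a quick worked example with assumed values: a 14x14 input, 3x3 kernel, stride 2 and padding 1 gives 2 * (14 - 1) + 3 - 2 * 1 = 27, i.e. a 27x27 output.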
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
new file mode 100644
index 0000000..a4bce5d
--- /dev/null
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+namespace arm_compute
+{
+namespace helpers
+{
+namespace tensor_transform
+{
+Coordinates slice_absolute_end_coords(TensorShape input_shape, Coordinates ends)
+{
+ // Create end mask
+ int32_t end_mask = 0;
+ for(unsigned int i = 0; i < ends.num_dimensions(); ++i)
+ {
+ if(ends[i] < 0)
+ {
+ end_mask |= 1 << i;
+ }
+ }
+ // Get unit strides
+ const BiStrides unit_strides = strided_slice_strides(input_shape, BiStrides());
+
+ return strided_slice_absolute_end_coords(input_shape, Coordinates(), ends, unit_strides, end_mask);
+}
+
+TensorShape compute_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends_abs)
+{
+ // Get unit strides
+ const BiStrides unit_strides = strided_slice_strides(input_shape, BiStrides());
+ return compute_strided_slice_output_shape(input_shape, starts, ends_abs, unit_strides);
+}
+
+Coordinates strided_slice_absolute_start_coords(TensorShape input_shape, Coordinates starts, Coordinates strides, int32_t begin_mask)
+{
+ Coordinates starts_abs;
+ for(unsigned int i = 0; i < starts.num_dimensions(); ++i)
+ {
+ // Get start index
+ int start_i = starts[i];
+
+ // Reset in case of begin mask present
+ if((begin_mask & 1 << i) != 0)
+ {
+ start_i = strides[i] > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
+ }
+
+ // Account for negative start points
+ const int dim_size = input_shape[i];
+ if(start_i < 0)
+ {
+ start_i += dim_size;
+ }
+
+ // Final clamp
+ start_i = utility::clamp(start_i, 0, dim_size - 1);
+ starts_abs.set(i, start_i);
+ }
+
+ // Fill remaining
+ for(unsigned int i = starts_abs.num_dimensions(); i < input_shape.num_dimensions(); ++i)
+ {
+ starts_abs.set(i, 0);
+ }
+
+ return starts_abs;
+}
+
+Coordinates strided_slice_absolute_end_coords(TensorShape input_shape, Coordinates starts_abs, Coordinates ends, Coordinates strides,
+ int32_t end_mask, int32_t shrink_axis_mask)
+{
+ Coordinates ends_abs;
+ for(unsigned int i = 0; i < ends.num_dimensions(); ++i)
+ {
+ // Get end index
+ int stop_i = ends[i];
+
+ // Shrink dimension
+ if((shrink_axis_mask & (1 << i)) != 0)
+ {
+ stop_i = starts_abs[i] + 1;
+ }
+
+ // Reset in case of end mask present
+ if((end_mask & 1 << i) != 0)
+ {
+ stop_i = (strides[i] > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest();
+ }
+
+ // Account for negative end points
+ const int dim_size = input_shape[i];
+ if(stop_i < 0)
+ {
+ stop_i += dim_size;
+ }
+
+ // Final clamp
+ stop_i = (strides[i] > 0) ? utility::clamp(stop_i, 0, dim_size) : utility::clamp(stop_i, -1, dim_size - 1);
+ ends_abs.set(i, stop_i);
+ }
+
+ // Fill remaining ends
+ for(unsigned int i = ends_abs.num_dimensions(); i < input_shape.num_dimensions(); ++i)
+ {
+ ends_abs.set(i, input_shape[i]);
+ }
+
+ return ends_abs;
+}
+
+Coordinates strided_slice_strides(TensorShape input_shape, Coordinates strides)
+{
+ for(unsigned int i = strides.num_dimensions(); i < input_shape.num_dimensions(); ++i)
+ {
+ strides.set(i, 1);
+ }
+ return strides;
+}
+
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts_abs, Coordinates ends_abs, Coordinates final_strides)
+{
+ TensorShape output_shape = input_shape;
+ for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ {
+ const int stride_i = final_strides[i];
+ const int range = ends_abs[i] - starts_abs[i];
+ if((range == 0) || // Zero range
+ (range < 0 && stride_i >= 0) || // Negative range with positive stride
+ (range > 0 && stride_i <= 0)) // Positive range with negative stride
+ {
+ output_shape.set(i, 0);
+ return output_shape;
+ }
+ else
+ {
+ int dim = range / stride_i + (range % stride_i != 0 ? 1 : 0);
+ output_shape.set(i, dim);
+ }
+ }
+ return output_shape;
+}
+} // namespace tensor_transform
+} // namespace helpers
+} // namespace arm_compute
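Taken together, these helpers resolve masks and negative indices into absolute coordinates and then derive each sliced dimension with a ceiling division by the stride. A minimal sketch of that flow on a single axis, with made-up values:

    // Sketch: take elements 1, 3, 5, 7 out of a 10-element axis.
    // Expected: starts_abs = (1), ends_abs = (9), output dim = ceil(8 / 2) = 4.
    #include "arm_compute/core/utils/helpers/tensor_transform.h"

    using namespace arm_compute;
    using namespace arm_compute::helpers::tensor_transform;

    TensorShape strided_slice_example()
    {
        const TensorShape input_shape(10U);
        const Coordinates starts(1);
        const Coordinates ends(-1); // Negative end index: resolved to 10 - 1 = 9
        const BiStrides   strides(2);

        const Coordinates starts_abs = strided_slice_absolute_start_coords(input_shape, starts, strides, 0);
        const Coordinates ends_abs   = strided_slice_absolute_end_coords(input_shape, starts_abs, ends, strides, 0, 0);
        return compute_strided_slice_output_shape(input_shape, starts_abs, ends_abs, strides);
    }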
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 8bb6d8e..ea9ba77 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,21 +30,30 @@
using namespace arm_compute::quantization;
constexpr int64_t fixed_point_one_Q0 = (1ll << 31);
+constexpr float epsilon = 0.00001f;
-arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_less_than_one(double multiplier,
+arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_less_than_one(float multiplier,
int *quant_multiplier,
int *right_shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(quant_multiplier == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(right_shift == nullptr);
- ARM_COMPUTE_RETURN_ERROR_ON(multiplier < 0);
- ARM_COMPUTE_RETURN_ERROR_ON(multiplier >= 1);
- if(multiplier == 0)
+ ARM_COMPUTE_RETURN_ERROR_ON(multiplier < -epsilon);
+ ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f + epsilon);
+ if(std::fabs(1.0f - multiplier) < epsilon)
+ {
+ *quant_multiplier = 1;
+ *right_shift = 0;
+ return arm_compute::Status{};
+ }
+
+ if(std::fabs(0.0f - multiplier) < epsilon)
{
*quant_multiplier = 0;
*right_shift = 0;
return arm_compute::Status{};
}
+
const double q = std::frexp(multiplier, right_shift);
*right_shift *= -1;
auto q_fixed = static_cast<int64_t>(round(q * fixed_point_one_Q0));
@@ -61,7 +70,7 @@
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_greater_than_one(double multiplier,
+arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_greater_than_one(float multiplier,
int *quantized_multiplier,
int *left_shift)
{
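The float overload keeps the same fixed-point contract as before: on success, multiplier is approximately quant_multiplier * 2^-31 * 2^-right_shift. A worked example with an assumed input value (the header path is assumed to be the usual public one):

    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    void quantized_multiplier_example()
    {
        int quant_multiplier = 0;
        int right_shift      = 0;
        // For 0.25f, std::frexp gives 0.5 * 2^-1, so we expect
        // quant_multiplier == 1 << 30 (0.5 in Q0.31) and right_shift == 1:
        // (1 << 30) * 2^-31 * 2^-1 == 0.25.
        arm_compute::quantization::calculate_quantized_multiplier_less_than_one(0.25f, &quant_multiplier, &right_shift);
    }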
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
index 81a18c4..b2ca28d 100644
--- a/src/graph/GraphBuilder.cpp
+++ b/src/graph/GraphBuilder.cpp
@@ -132,7 +132,7 @@
TensorDescriptor common_desc = input_tensor_desc;
common_desc.shape = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
- // Create mean and nodes
+ // Create mean and var nodes
auto mean_nid = add_const_node_with_name(g, params, "Mean", common_desc, std::move(mean_accessor));
auto var_nid = add_const_node_with_name(g, params, "Variance", common_desc, std::move(var_accessor));
@@ -168,6 +168,20 @@
return batch_norm_nid;
}
+NodeID GraphBuilder::add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info)
+{
+ CHECK_NODEIDX_PAIR(input, g);
+ CHECK_NODEIDX_PAIR(deltas, g);
+
+ NodeID nid = g.add_node<BoundingBoxTransformLayerNode>(info);
+
+ g.add_connection(input.node_id, input.index, nid, 0);
+ g.add_connection(deltas.node_id, deltas.index, nid, 1);
+
+ set_node_params(g, nid, params);
+ return nid;
+}
+
NodeID GraphBuilder::add_channel_shuffle_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_groups)
{
return create_simple_single_input_output_node<ChannelShuffleLayerNode>(g, params, input, num_groups);
@@ -327,7 +341,13 @@
{
TensorDescriptor b_desc = input_tensor_desc;
b_desc.shape = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
- b_nid = add_const_node_with_name(g, params, "Bias", b_desc, std::move(bias_accessor));
+
+ if(is_data_type_quantized_asymmetric(b_desc.data_type))
+ {
+ b_desc.data_type = DataType::S32;
+ }
+
+ b_nid = add_const_node_with_name(g, params, "Bias", b_desc, std::move(bias_accessor));
}
// Create convolution node and connect
@@ -412,11 +432,58 @@
return fc_nid;
}
+NodeID GraphBuilder::add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas, NodeIdxPair anchors, GenerateProposalsInfo info)
+{
+ CHECK_NODEIDX_PAIR(scores, g);
+ CHECK_NODEIDX_PAIR(deltas, g);
+ CHECK_NODEIDX_PAIR(anchors, g);
+
+ NodeID nid = g.add_node<GenerateProposalsLayerNode>(info);
+
+ g.add_connection(scores.node_id, scores.index, nid, 0);
+ g.add_connection(deltas.node_id, deltas.index, nid, 1);
+ g.add_connection(anchors.node_id, anchors.index, nid, 2);
+
+ set_node_params(g, nid, params);
+ return nid;
+}
+
NodeID GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info)
{
return create_simple_single_input_output_node<NormalizationLayerNode>(g, params, input, norm_info);
}
+NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input,
+ ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor)
+{
+ CHECK_NODEIDX_PAIR(input, g);
+
+ // Get input tensor descriptor
+ const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+ // Calculate Common Descriptor
+ TensorDescriptor common_desc = input_tensor_desc;
+ common_desc.shape = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
+
+ // Create mean and std nodes
+ auto mean_nid = add_const_node_with_name(g, params, "Mean", common_desc, std::move(mean_accessor));
+ auto std_nid = add_const_node_with_name(g, params, "Std", common_desc, std::move(std_accessor));
+
+ // Create normalize planar YUV node and add connections
+ NodeID norm_planar_yuv_nid = g.add_node<NormalizePlanarYUVLayerNode>();
+ g.add_connection(input.node_id, input.index, norm_planar_yuv_nid, 0);
+ g.add_connection(mean_nid, 0, norm_planar_yuv_nid, 1);
+ g.add_connection(std_nid, 0, norm_planar_yuv_nid, 2);
+ set_node_params(g, norm_planar_yuv_nid, params);
+
+ return norm_planar_yuv_nid;
+}
+
+NodeID GraphBuilder::add_pad_node(Graph &g, NodeParams params, NodeIdxPair input, PaddingList padding)
+{
+ return create_simple_single_input_output_node<PadLayerNode>(g, params, input, padding);
+}
+
NodeID GraphBuilder::add_permute_node(Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout)
{
return create_simple_single_input_output_node<PermuteLayerNode>(g, params, input, perm, layout);
@@ -427,6 +494,26 @@
return create_simple_single_input_output_node<PoolingLayerNode>(g, params, input, pool_info);
}
+NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, PriorBoxLayerInfo prior_info)
+{
+ CHECK_NODEIDX_PAIR(input0, g);
+ CHECK_NODEIDX_PAIR(input1, g);
+
+ // Create priorbox node and connect
+ NodeID prior_nid = g.add_node<PriorBoxLayerNode>(prior_info);
+ g.add_connection(input0.node_id, input0.index, prior_nid, 0);
+ g.add_connection(input1.node_id, input1.index, prior_nid, 1);
+
+ set_node_params(g, prior_nid, params);
+
+ return prior_nid;
+}
+
+NodeID GraphBuilder::add_reorg_node(Graph &g, NodeParams params, NodeIdxPair input, int stride)
+{
+ return create_simple_single_input_output_node<ReorgLayerNode>(g, params, input, stride);
+}
+
NodeID GraphBuilder::add_reshape_node(Graph &g, NodeParams params, NodeIdxPair input, TensorShape shape)
{
return create_simple_single_input_output_node<ReshapeLayerNode>(g, params, input, shape);
@@ -438,6 +525,20 @@
return create_simple_single_input_output_node<ResizeLayerNode>(g, params, input, policy, width_scale, height_scale);
}
+NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info)
+{
+ CHECK_NODEIDX_PAIR(input, g);
+ CHECK_NODEIDX_PAIR(rois, g);
+
+ NodeID nid = g.add_node<ROIAlignLayerNode>(pool_info);
+
+ g.add_connection(input.node_id, input.index, nid, 0);
+ g.add_connection(rois.node_id, rois.index, nid, 1);
+
+ set_node_params(g, nid, params);
+ return nid;
+}
+
NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams &params, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor)
{
CHECK_NODEIDX_PAIR(input, g);
@@ -472,9 +573,24 @@
return create_simple_single_input_output_node<SoftmaxLayerNode>(g, params, input, beta);
}
+NodeID GraphBuilder::add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends)
+{
+ return create_simple_single_input_output_node<SliceLayerNode>(g, params, input, starts, ends);
+}
+
NodeID GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis)
{
return create_simple_single_input_output_node<SplitLayerNode>(g, params, input, num_splits, axis);
}
+
+NodeID GraphBuilder::add_upsample_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D info, InterpolationPolicy upsampling_policy)
+{
+ return create_simple_single_input_output_node<UpsampleLayerNode>(g, params, input, info, upsampling_policy);
+}
+
+NodeID GraphBuilder::add_yolo_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info, int32_t num_classes)
+{
+ return create_simple_single_input_output_node<YOLOLayerNode>(g, params, input, act_info, num_classes);
+}
} // namespace graph
} // namespace arm_compute
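Most of the new builder entry points reuse the single-input helper; the multi-input ones wire each NodeIdxPair to a fixed input slot before setting the node parameters. A hedged sketch of calling two of the simpler ones directly (the NodeParams and NodeIdxPair initialisers are assumed aggregate layouts, not taken from this patch):

    // Illustrative only: attach a reorg node and then a slice node to an existing producer.
    using namespace arm_compute;
    using namespace arm_compute::graph;

    NodeID add_reorg_then_slice(Graph &g, NodeID producer)
    {
        NodeParams  params{ "reorg_slice", Target::CL }; // Assumed {name, target} layout
        NodeIdxPair input{ producer, 0 };                // Output 0 of the producer node

        const NodeID reorg = GraphBuilder::add_reorg_node(g, params, input, 2 /* stride */);

        Coordinates starts(0, 0, 0);
        Coordinates ends(-1, -1, -1); // Negative ends are resolved by the slice helpers
        return GraphBuilder::add_slice_node(g, params, NodeIdxPair{ reorg, 0 }, starts, ends);
    }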
diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp
index 5f33ed3..037b40b 100644
--- a/src/graph/GraphContext.cpp
+++ b/src/graph/GraphContext.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/graph.h"
#include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
namespace arm_compute
{
@@ -75,17 +76,20 @@
void GraphContext::finalize()
{
+ const size_t num_pools = 1;
for(auto &mm_obj : _memory_managers)
{
+ ARM_COMPUTE_ERROR_ON(!mm_obj.second.allocator);
+
// Finalize intra layer memory manager
if(mm_obj.second.intra_mm != nullptr)
{
- mm_obj.second.intra_mm->finalize();
+ mm_obj.second.intra_mm->populate(*mm_obj.second.allocator, num_pools);
}
// Finalize cross layer memory manager
if(mm_obj.second.cross_mm != nullptr)
{
- mm_obj.second.cross_mm->finalize();
+ mm_obj.second.cross_mm->populate(*mm_obj.second.allocator, num_pools);
}
}
}
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index f9d13ac..57c5f9d 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -55,6 +55,7 @@
}
// Force target to all graph construct
+ // TODO (geopin01) : Support heterogeneous execution
Target forced_target = target;
if(!is_target_supported(target))
{
@@ -101,7 +102,7 @@
// Register graph
_workloads.insert(std::make_pair(graph.id(), std::move(workload)));
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id() << std::endl);
}
void GraphManager::execute_graph(Graph &graph)
@@ -137,4 +138,4 @@
_workloads.erase(it);
}
} // namespace graph
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/PassManager.cpp b/src/graph/PassManager.cpp
index 8ed68bd..92860e2 100644
--- a/src/graph/PassManager.cpp
+++ b/src/graph/PassManager.cpp
@@ -44,9 +44,9 @@
return (index >= _passes.size()) ? nullptr : _passes.at(index).get();
}
-void PassManager::append(std::unique_ptr<IGraphMutator> pass)
+void PassManager::append(std::unique_ptr<IGraphMutator> pass, bool conditional)
{
- if(pass)
+ if(pass && conditional)
{
ARM_COMPUTE_LOG_GRAPH_VERBOSE("Appending mutating pass : " << pass->name() << std::endl);
_passes.push_back(std::move(pass));
diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp
index 0a85a7f..71ec548 100644
--- a/src/graph/Utils.cpp
+++ b/src/graph/Utils.cpp
@@ -78,20 +78,16 @@
{
PassManager pm;
+ const bool is_target_gc = target == Target::GC;
+
// Passes that mutate graph IR
+ pm.append(support::cpp14::make_unique<NodeFusionMutator>(), !is_target_gc);
pm.append(support::cpp14::make_unique<GroupedConvolutionMutator>());
- if(target != Target::GC)
- {
- pm.append(support::cpp14::make_unique<NodeFusionMutator>());
- pm.append(support::cpp14::make_unique<InPlaceOperationMutator>());
- }
+ pm.append(support::cpp14::make_unique<InPlaceOperationMutator>(), !is_target_gc);
// Passes that mutate backend information
- if(target != Target::GC)
- {
- pm.append(support::cpp14::make_unique<DepthConcatSubTensorMutator>());
- pm.append(support::cpp14::make_unique<SplitLayerSubTensorMutator>());
- }
+ pm.append(support::cpp14::make_unique<DepthConcatSubTensorMutator>(), !is_target_gc);
+ pm.append(support::cpp14::make_unique<SplitLayerSubTensorMutator>(), !is_target_gc);
pm.append(support::cpp14::make_unique<NodeExecutionMethodMutator>());
return pm;
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
index 1dbeae9..ae7f0a5 100644
--- a/src/graph/backends/CL/CLDeviceBackend.cpp
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -69,6 +69,7 @@
CLDeviceBackend::~CLDeviceBackend()
{
+ // TODO (geopin01) : Shouldn't call non exception safe stuff here
if(_tuner.tune_new_kernels() && !_tuner.lws_table().empty() && !_tuner_file.empty())
{
_tuner.save_to_file(_tuner_file);
@@ -126,6 +127,7 @@
mm_ctx.intra_mm = create_memory_manager(MemoryManagerAffinity::Buffer);
mm_ctx.cross_mm = create_memory_manager(MemoryManagerAffinity::Buffer);
mm_ctx.cross_group = std::make_shared<CLMemoryGroup>(mm_ctx.cross_mm);
+ mm_ctx.allocator = _allocator.get();
ctx.insert_memory_management_ctx(std::move(mm_ctx));
}
@@ -194,8 +196,6 @@
auto pool_mgr = std::make_shared<PoolManager>();
auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
- mm->set_allocator(_allocator.get());
-
return mm;
}
} // namespace backends
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index bf3dcba..c37a137 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -83,6 +83,8 @@
return detail::create_activation_layer<CLActivationLayer, CLTargetInfo>(*polymorphic_downcast<ActivationLayerNode *>(node));
case NodeType::BatchNormalizationLayer:
return detail::create_batch_normalization_layer<CLBatchNormalizationLayer, CLTargetInfo>(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+ case NodeType::BoundingBoxTransformLayer:
+ return detail::create_bounding_box_transform_layer<CLBoundingBoxTransform, CLTargetInfo>(*polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
case NodeType::ChannelShuffleLayer:
return detail::create_channel_shuffle_layer<CLChannelShuffleLayer, CLTargetInfo>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
case NodeType::ConvolutionLayer:
@@ -99,22 +101,40 @@
return detail::create_flatten_layer<CLFlattenLayer, CLTargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
case NodeType::FullyConnectedLayer:
return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+ case NodeType::GenerateProposalsLayer:
+ return detail::create_generate_proposals_layer<CLGenerateProposalsLayer, CLTargetInfo>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node), ctx);
case NodeType::NormalizationLayer:
return detail::create_normalization_layer<CLNormalizationLayer, CLTargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
+ case NodeType::NormalizePlanarYUVLayer:
+ return detail::create_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer, CLTargetInfo>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
+ case NodeType::PadLayer:
+ return detail::create_pad_layer<CLPadLayer, CLTargetInfo>(*polymorphic_downcast<PadLayerNode *>(node));
case NodeType::PermuteLayer:
return detail::create_permute_layer<CLPermute, CLTargetInfo>(*polymorphic_downcast<PermuteLayerNode *>(node));
case NodeType::PoolingLayer:
return detail::create_pooling_layer<CLPoolingLayer, CLTargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
+ case NodeType::PriorBoxLayer:
+ return detail::create_priorbox_layer<CLPriorBoxLayer, CLTargetInfo>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+ case NodeType::ReorgLayer:
+ return detail::create_reorg_layer<CLReorgLayer, CLTargetInfo>(*polymorphic_downcast<ReorgLayerNode *>(node));
case NodeType::ReshapeLayer:
return detail::create_reshape_layer<CLReshapeLayer, CLTargetInfo>(*polymorphic_downcast<ReshapeLayerNode *>(node));
case NodeType::ResizeLayer:
return detail::create_resize_layer<CLScale, CLTargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
+ case NodeType::ROIAlignLayer:
+ return detail::create_roi_align_layer<CLROIAlignLayer, CLTargetInfo>(*polymorphic_downcast<ROIAlignLayerNode *>(node));
+ case NodeType::SliceLayer:
+ return detail::create_slice_layer<CLSlice, CLTargetInfo>(*polymorphic_downcast<SliceLayerNode *>(node));
case NodeType::SoftmaxLayer:
return detail::create_softmax_layer<CLSoftmaxLayer, CLTargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+ case NodeType::UpsampleLayer:
+ return detail::create_upsample_layer<CLUpsampleLayer, CLTargetInfo>(*polymorphic_downcast<UpsampleLayerNode *>(node), ctx);
+ case NodeType::YOLOLayer:
+ return detail::create_yolo_layer<CLYOLOLayer, CLTargetInfo>(*polymorphic_downcast<YOLOLayerNode *>(node), ctx);
default:
return nullptr;
}
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index ba5b59d..a070973 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -47,6 +47,8 @@
NodeType type = node->type();
switch(type)
{
+ case NodeType::BoundingBoxTransformLayer:
+ return detail::validate_bounding_box_transform_layer<CLBoundingBoxTransform>(*polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
case NodeType::ChannelShuffleLayer:
return detail::validate_channel_shuffle_layer<CLChannelShuffleLayer>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
case NodeType::ConvolutionLayer:
@@ -57,12 +59,30 @@
case NodeType::DepthwiseConvolutionLayer:
return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer,
CLDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::GenerateProposalsLayer:
+ return detail::validate_generate_proposals_layer<CLGenerateProposalsLayer>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node));
+ case NodeType::NormalizePlanarYUVLayer:
+ return detail::validate_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
+ case NodeType::PadLayer:
+ return detail::validate_pad_layer<CLPadLayer>(*polymorphic_downcast<PadLayerNode *>(node));
case NodeType::PermuteLayer:
return detail::validate_permute_layer<CLPermute>(*polymorphic_downcast<PermuteLayerNode *>(node));
+ case NodeType::PriorBoxLayer:
+ return detail::validate_priorbox_layer<CLPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+ case NodeType::ReorgLayer:
+ return detail::validate_reorg_layer<CLReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
+ case NodeType::ROIAlignLayer:
+ return detail::validate_roi_align_layer<CLROIAlignLayer>(*polymorphic_downcast<ROIAlignLayerNode *>(node));
+ case NodeType::SliceLayer:
+ return detail::validate_slice_layer<CLSlice>(*polymorphic_downcast<SliceLayerNode *>(node));
+ case NodeType::UpsampleLayer:
+ return detail::validate_upsample_layer<CLUpsampleLayer>(*polymorphic_downcast<UpsampleLayerNode *>(node));
+ case NodeType::YOLOLayer:
+ return detail::validate_yolo_layer<CLYOLOLayer>(*polymorphic_downcast<YOLOLayerNode *>(node));
default:
return Status{};
}
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLTensorHandle.cpp b/src/graph/backends/CL/CLTensorHandle.cpp
index fdb044c..219d9d0 100644
--- a/src/graph/backends/CL/CLTensorHandle.cpp
+++ b/src/graph/backends/CL/CLTensorHandle.cpp
@@ -69,6 +69,7 @@
void CLTensorHandle::release_if_unused()
{
+ // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
if(!_tensor.is_used())
{
_tensor.allocator()->free();
@@ -101,4 +102,4 @@
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/GLES/GCDeviceBackend.cpp b/src/graph/backends/GLES/GCDeviceBackend.cpp
index ec3cf4f..5f0bf3f 100644
--- a/src/graph/backends/GLES/GCDeviceBackend.cpp
+++ b/src/graph/backends/GLES/GCDeviceBackend.cpp
@@ -86,6 +86,7 @@
mm_ctx.intra_mm = create_memory_manager(MemoryManagerAffinity::Buffer);
mm_ctx.cross_mm = create_memory_manager(MemoryManagerAffinity::Buffer);
mm_ctx.cross_group = std::make_shared<GCMemoryGroup>(mm_ctx.cross_mm);
+ mm_ctx.allocator = &_allocator;
ctx.insert_memory_management_ctx(std::move(mm_ctx));
}
@@ -151,8 +152,6 @@
auto pool_mgr = std::make_shared<PoolManager>();
auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
- mm->set_allocator(&_allocator);
-
return mm;
}
} // namespace backends
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
index f72513c..2ca453e 100644
--- a/src/graph/backends/GLES/GCFunctionsFactory.cpp
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -94,7 +94,8 @@
func->configure(inputs, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type()
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+ << node.name()
<< " Target " << GCTargetInfo::TargetType
<< " Data Type: " << output->info()->data_type()
<< " Shape: " << output->info()->tensor_shape()
@@ -120,8 +121,9 @@
biases->info()->set_data_type(DataType::S32);
}
- const PadStrideInfo conv_info = node.convolution_info();
- const ConvolutionMethod conv_algorithm = node.convolution_method();
+ const PadStrideInfo conv_info = node.convolution_info();
+ const ConvolutionMethod conv_algorithm = node.convolution_method();
+ const ActivationLayerInfo fused_act = node.fused_activation();
// Create and configure function (we assume that functions have been validated before creation)
std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, GCTargetInfo::TargetType);
@@ -132,23 +134,26 @@
{
std::tie(func, func_name) = create_named_function<GCConvolutionLayerFunctions::DirectConvolutionLayer>(
std::string("DirectConvolutionLayer"),
- input, weights, biases, output, conv_info);
+ input, weights, biases, output, conv_info, fused_act);
}
else
{
std::tie(func, func_name) = create_named_memory_managed_function<GCConvolutionLayerFunctions::GenericConvolutionLayer>(
std::string("ConvolutionLayer"), mm,
- input, weights, biases, output, conv_info);
+ input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act);
}
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+ << node.name()
+ << " Type: " << func_name
<< " Data Type: " << input->info()->data_type()
<< " Input QuantInfo: " << input->info()->quantization_info()
<< " Weights QuantInfo: " << weights->info()->quantization_info()
<< " Input shape: " << input->info()->tensor_shape()
<< " Weights shape: " << weights->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
<< std::endl);
return func;
}
@@ -169,8 +174,10 @@
biases->info()->set_data_type(DataType::S32);
}
- const PadStrideInfo conv_info = node.convolution_info();
- const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
+ const PadStrideInfo conv_info = node.convolution_info();
+ const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
+ const unsigned int depth_multiplier = 1;
+ const ActivationLayerInfo fused_act = node.fused_activation();
// Create and configure function (we assume that functions have been validated before creation)
std::unique_ptr<IFunction> func;
@@ -179,7 +186,7 @@
{
std::tie(func, func_name) = create_named_function<GCDepthwiseConvolutionLayerFunctions::DepthwiseConvolutionLayer3x3>(
std::string("DepthwiseConvolutionLayer3x3"),
- input, weights, biases, output, conv_info);
+ input, weights, biases, output, conv_info, depth_multiplier, fused_act);
}
else
{
@@ -187,7 +194,9 @@
}
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+ << node.name()
+ << " Type: " << func_name
<< " Target " << GCTargetInfo::TargetType
<< " Data Type: " << input->info()->data_type()
<< " Input QuantInfo: " << input->info()->quantization_info()
@@ -195,6 +204,7 @@
<< " Input shape: " << input->info()->tensor_shape()
<< " Weights shape: " << weights->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
<< std::endl);
return func;
}
@@ -241,11 +251,13 @@
}
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type()
- << " Target " << GCTargetInfo::TargetType
- << " Operation " << func_name
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+ << node.name()
+ << " Type: " << node.type()
+ << " Target: " << GCTargetInfo::TargetType
+ << " Operation: " << func_name
<< " Data Type: " << input1->info()->data_type()
- << " Shape : " << input1->info()->tensor_shape()
+ << " Shape: " << input1->info()->tensor_shape()
<< std::endl);
return func;
@@ -278,6 +290,8 @@
return detail::create_fully_connected_layer<GCFullyConnectedLayer, GCTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
case NodeType::NormalizationLayer:
return detail::create_normalization_layer<GCNormalizationLayer, GCTargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
+ case NodeType::NormalizePlanarYUVLayer:
+ return detail::create_normalize_planar_yuv_layer<GCNormalizePlanarYUVLayer, GCTargetInfo>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
case NodeType::PoolingLayer:
return detail::create_pooling_layer<GCPoolingLayer, GCTargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
case NodeType::ResizeLayer:
@@ -290,4 +304,4 @@
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp
index 53049c7..fe69c7a 100644
--- a/src/graph/backends/GLES/GCNodeValidator.cpp
+++ b/src/graph/backends/GLES/GCNodeValidator.cpp
@@ -55,6 +55,7 @@
arm_compute::ITensorInfo *weights = detail::get_backing_tensor_info(node.input(1));
ARM_COMPUTE_ERROR_ON(weights == nullptr);
+ // TODO (geopin01) : Switch when validation is implemented
// Validate function
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->tensor_shape().x() != 3 && weights->tensor_shape().y() != 3, "Unsupported depthwise convolution");
node.set_depthwise_convolution_method(DepthwiseConvolutionMethod::Optimized3x3);
@@ -102,6 +103,8 @@
NodeType type = node->type();
switch(type)
{
+ case NodeType::BoundingBoxTransformLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : BoundingBoxTransformLayer");
case NodeType::ChannelShuffleLayer:
return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ChannelShuffleLayer");
case NodeType::ConvolutionLayer:
@@ -110,10 +113,28 @@
return validate_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
case NodeType::FlattenLayer:
return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : FlattenLayer");
+ case NodeType::GenerateProposalsLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer");
+ case NodeType::NormalizePlanarYUVLayer:
+ return detail::validate_normalize_planar_yuv_layer<GCNormalizePlanarYUVLayer>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
+ case NodeType::PadLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PadLayer");
case NodeType::PermuteLayer:
return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PermuteLayer");
+ case NodeType::PriorBoxLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PriorBoxLayer");
+ case NodeType::ReorgLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ReorgLayer");
case NodeType::ReshapeLayer:
return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ReshapeLayer");
+ case NodeType::ROIAlignLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer");
+ case NodeType::SliceLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : SliceLayer");
+ case NodeType::UpsampleLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : UpsampleLayer");
+ case NodeType::YOLOLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : YOLOLayer");
default:
return Status{};
}
diff --git a/src/graph/backends/GLES/GCTensorHandle.cpp b/src/graph/backends/GLES/GCTensorHandle.cpp
index 6f96263..4e5c652 100644
--- a/src/graph/backends/GLES/GCTensorHandle.cpp
+++ b/src/graph/backends/GLES/GCTensorHandle.cpp
@@ -69,6 +69,7 @@
void GCTensorHandle::release_if_unused()
{
+ // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
if(!_tensor.is_used())
{
_tensor.allocator()->free();
@@ -101,4 +102,4 @@
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp
index 5fc44d0..23ced2f 100644
--- a/src/graph/backends/NEON/NEDeviceBackend.cpp
+++ b/src/graph/backends/NEON/NEDeviceBackend.cpp
@@ -86,6 +86,7 @@
mm_ctx.intra_mm = create_memory_manager(MemoryManagerAffinity::Offset);
mm_ctx.cross_mm = create_memory_manager(MemoryManagerAffinity::Offset);
mm_ctx.cross_group = std::make_shared<MemoryGroup>(mm_ctx.cross_mm);
+ mm_ctx.allocator = &_allocator;
ctx.insert_memory_management_ctx(std::move(mm_ctx));
}
@@ -156,8 +157,6 @@
auto pool_mgr = std::make_shared<PoolManager>();
auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
- mm->set_allocator(&_allocator);
-
return mm;
}
} // namespace backends
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index 36a25ad..ca8d485 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -90,13 +90,16 @@
NETargetInfo::TensorType *biases = get_backing_tensor<NETargetInfo>(node.input(2));
NETargetInfo::TensorType *output = get_backing_tensor<NETargetInfo>(node.output(0));
- if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+
+ if(is_quantized)
{
biases->info()->set_data_type(DataType::S32);
}
- const PadStrideInfo conv_info = node.convolution_info();
- const ConvolutionMethod conv_algorithm = node.convolution_method();
+ const PadStrideInfo conv_info = node.convolution_info();
+ const ConvolutionMethod conv_algorithm = node.convolution_method();
+ const ActivationLayerInfo fused_act = node.fused_activation();
// Create and configure function (we assume that functions have been validated before creation)
std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, Target::NEON);
@@ -105,33 +108,40 @@
if(conv_algorithm == ConvolutionMethod::Direct)
{
std::tie(func, func_name) = create_named_memory_managed_function<NEDirectConvolutionLayer>(
- std::string("DirectConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+ std::string("DirectConvolutionLayer"), mm, input, weights, biases, output, conv_info, fused_act);
}
else if(conv_algorithm == ConvolutionMethod::GEMM)
{
std::tie(func, func_name) = create_named_memory_managed_function<NEGEMMConvolutionLayer>(
- std::string("GEMMConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+ std::string("GEMMConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1, 1), fused_act);
}
else if(conv_algorithm == ConvolutionMethod::Winograd)
{
std::tie(func, func_name) = create_named_memory_managed_function<NEWinogradConvolutionLayer>(
- std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+ std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info, fused_act);
}
else
{
std::tie(func, func_name) = create_named_memory_managed_function<NEConvolutionLayer>(
- std::string("ConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+ std::string("ConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1, 1), fused_act);
}
// Log info
+ std::ostringstream qss;
+ if(is_quantized)
+ {
+ qss << " Input QuantInfo: " << input->info()->quantization_info()
+ << " Weights QuantInfo: " << weights->info()->quantization_info()
+ << " Output QuantInfo: " << output->info()->quantization_info();
+ }
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
<< " Target " << NETargetInfo::TargetType
<< " Data Type: " << input->info()->data_type()
- << " Input QuantInfo: " << input->info()->quantization_info()
- << " Weights QuantInfo: " << weights->info()->quantization_info()
+ << qss.str()
<< " Input shape: " << input->info()->tensor_shape()
<< " Weights shape: " << weights->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
<< std::endl);
return func;
}
@@ -153,8 +163,10 @@
func->configure(input, output, norm_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type()
- << " Target " << NETargetInfo::TargetType
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+ << node.name()
+ << " Type: " << node.type()
+ << " Target: " << NETargetInfo::TargetType
<< " Data Type: " << input->info()->data_type()
<< " Input shape: " << input->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
@@ -179,6 +191,8 @@
return detail::create_activation_layer<NEActivationLayer, NETargetInfo>(*polymorphic_downcast<ActivationLayerNode *>(node));
case NodeType::BatchNormalizationLayer:
return detail::create_batch_normalization_layer<NEBatchNormalizationLayer, NETargetInfo>(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+ case NodeType::ChannelShuffleLayer:
+ return detail::create_channel_shuffle_layer<NEChannelShuffleLayer, NETargetInfo>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
case NodeType::ConvolutionLayer:
return detail::create_convolution_layer<NEConvolutionLayerFunctions, NETargetInfo>(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
case NodeType::DeconvolutionLayer:
@@ -199,16 +213,24 @@
return detail::create_permute_layer<NEPermute, NETargetInfo>(*polymorphic_downcast<PermuteLayerNode *>(node));
case NodeType::PoolingLayer:
return detail::create_pooling_layer<NEPoolingLayer, NETargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
+ case NodeType::PriorBoxLayer:
+ return detail::create_priorbox_layer<NEPriorBoxLayer, NETargetInfo>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+ case NodeType::ReorgLayer:
+ return detail::create_reorg_layer<NEReorgLayer, NETargetInfo>(*polymorphic_downcast<ReorgLayerNode *>(node));
case NodeType::ReshapeLayer:
return detail::create_reshape_layer<NEReshapeLayer, NETargetInfo>(*polymorphic_downcast<ReshapeLayerNode *>(node));
case NodeType::ResizeLayer:
return detail::create_resize_layer<NEScale, NETargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
case NodeType::SoftmaxLayer:
return detail::create_softmax_layer<NESoftmaxLayer, NETargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+ case NodeType::UpsampleLayer:
+ return detail::create_upsample_layer<NEUpsampleLayer, NETargetInfo>(*polymorphic_downcast<UpsampleLayerNode *>(node), ctx);
+ case NodeType::YOLOLayer:
+ return detail::create_yolo_layer<NEYOLOLayer, NETargetInfo>(*polymorphic_downcast<YOLOLayerNode *>(node), ctx);
default:
return nullptr;
}
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
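
Note on the NEFunctionFactory hunk above: the quantization details are now staged in a separate std::ostringstream and appended to the log line only when the input is asymmetric-quantized, so float graphs stop printing empty QuantInfo fields. A minimal standalone sketch of that pattern follows; TensorInfoStub and make_log_line are illustrative stand-ins, not library types.

#include <iostream>
#include <sstream>
#include <string>

// Illustrative stand-in for the tensor-info queries used in the factory.
struct TensorInfoStub
{
    bool        quantized;
    std::string quant_info;
};

std::string make_log_line(const TensorInfoStub &input, const TensorInfoStub &weights)
{
    // Stage the optional part in its own stream, as in the hunk above.
    std::ostringstream qss;
    if(input.quantized)
    {
        qss << " Input QuantInfo: " << input.quant_info
            << " Weights QuantInfo: " << weights.quant_info;
    }

    std::ostringstream log;
    log << "Instantiated ConvolutionLayer" << qss.str();
    return log.str();
}

int main()
{
    std::cout << make_log_line({ false, "" }, { false, "" }) << "\n";
    std::cout << make_log_line({ true, "scale=0.5 offset=3" }, { true, "scale=0.1 offset=0" }) << "\n";
    return 0;
}
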
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index 58ffaf0..a2abc83 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -47,8 +47,10 @@
NodeType type = node->type();
switch(type)
{
+ case NodeType::BoundingBoxTransformLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : BoundingBoxTransformLayer");
case NodeType::ChannelShuffleLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ChannelShuffleLayer");
+ return detail::validate_channel_shuffle_layer<NEChannelShuffleLayer>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
case NodeType::ConvolutionLayer:
return detail::validate_convolution_layer<NEConvolutionLayer,
NEDirectConvolutionLayer,
@@ -57,12 +59,30 @@
case NodeType::DepthwiseConvolutionLayer:
return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer,
NEDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ case NodeType::GenerateProposalsLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer");
+ case NodeType::NormalizePlanarYUVLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : NormalizePlanarYUVLayer");
+ case NodeType::PadLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PadLayer");
case NodeType::PermuteLayer:
return detail::validate_permute_layer<NEPermute>(*polymorphic_downcast<PermuteLayerNode *>(node));
+ case NodeType::PriorBoxLayer:
+ return detail::validate_priorbox_layer<NEPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+ case NodeType::ReorgLayer:
+ return detail::validate_reorg_layer<NEReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
+ case NodeType::ROIAlignLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer");
+ case NodeType::SliceLayer:
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : SliceLayer");
+ case NodeType::UpsampleLayer:
+ return detail::validate_upsample_layer<NEUpsampleLayer>(*polymorphic_downcast<UpsampleLayerNode *>(node));
+ case NodeType::YOLOLayer:
+ return detail::validate_yolo_layer<NEYOLOLayer>(*polymorphic_downcast<YOLOLayerNode *>(node));
default:
return Status{};
}
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp
index caa2c10..5892116 100644
--- a/src/graph/backends/NEON/NETensorHandle.cpp
+++ b/src/graph/backends/NEON/NETensorHandle.cpp
@@ -68,6 +68,7 @@
void NETensorHandle::release_if_unused()
{
+ // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
if(!_tensor.is_used())
{
_tensor.allocator()->free();
@@ -100,4 +101,4 @@
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
index 6b2f68c..7fc5ca0 100644
--- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
+++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
@@ -87,6 +87,7 @@
// If its a const node:
if(node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types))
{
+ // TODO (geopin01) : Create IO iterator wrappers
// Add all its inputs / outputs to the list of constant handles
for(unsigned int i = 0; i < node->num_inputs(); ++i)
{
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index f479963..f2c381b 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -254,7 +254,8 @@
bool is_valid = true;
std::for_each(std::begin(workload.outputs), std::end(workload.outputs), [&](Tensor * output_tensor)
{
- is_valid = is_valid && (output_tensor != nullptr) && output_tensor->call_accessor();
+ bool valid_output = (output_tensor != nullptr) && output_tensor->call_accessor();
+ is_valid = is_valid && valid_output;
});
return is_valid;
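
The ExecutionHelpers change above matters because the old one-liner folded the accessor call into a && chain: once is_valid turned false, call_accessor() was short-circuited away for the remaining outputs and their accessors never ran. The sketch below reproduces the corrected pattern with stub types (OutputStub is not a library class) so every accessor is invoked regardless of earlier failures.

#include <iostream>
#include <vector>

// Illustrative output stub: call_accessor() has a side effect that must not be skipped.
struct OutputStub
{
    bool ok;
    bool called;

    bool call_accessor()
    {
        called = true;
        return ok;
    }
};

int main()
{
    std::vector<OutputStub> outputs = { { false, false }, { true, false }, { true, false } };

    bool is_valid = true;
    for(auto &output : outputs)
    {
        // Evaluate the accessor unconditionally, then fold its result into the overall status.
        const bool valid_output = output.call_accessor();
        is_valid                = is_valid && valid_output;
    }

    std::cout << "is_valid: " << is_valid << "\n"; // 0: the first output failed
    for(const auto &output : outputs)
    {
        std::cout << "accessor called: " << output.called << "\n"; // 1 for every output
    }
    return 0;
}
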
diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp
index 0d65d6a..d69d2cd 100644
--- a/src/graph/mutators/GroupedConvolutionMutator.cpp
+++ b/src/graph/mutators/GroupedConvolutionMutator.cpp
@@ -41,7 +41,7 @@
namespace
{
NodeID create_grouped_convolution(Graph &g, const NodeParams &params, NodeIdxPair input, NodeID weights, NodeID bias,
- PadStrideInfo conv_info, ConvolutionMethod method, FastMathHint fast_math_hint, unsigned int num_groups)
+ PadStrideInfo conv_info, ConvolutionMethod method, ActivationLayerInfo fused_act, FastMathHint fast_math_hint, unsigned int num_groups)
{
bool has_bias = (bias != EmptyNodeID);
@@ -86,6 +86,10 @@
ARM_COMPUTE_ERROR_ON(node == nullptr);
node->set_common_node_parameters(group_params);
+ // Down-cast node
+ auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node);
+ conv_node->set_fused_activation(fused_act);
+
convolution_outputs.push_back({ conv_nid, 0 });
}
@@ -127,17 +131,20 @@
auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node);
// Get internal convolution info
- const PadStrideInfo conv_info = conv_node->convolution_info();
- const ConvolutionMethod conv_method = conv_node->convolution_method();
- const FastMathHint fast_math_hint = conv_node->fast_math_hint();
- const unsigned int num_groups = conv_node->num_groups();
- const NodeParams params = conv_node->common_node_params();
- const Target assigned_target = conv_node->assigned_target();
+ // TODO (geopin01) : Create a descriptor or a clone interface
+ const PadStrideInfo conv_info = conv_node->convolution_info();
+ const ConvolutionMethod conv_method = conv_node->convolution_method();
+ const ActivationLayerInfo fused_act_info = conv_node->fused_activation();
+ const FastMathHint fast_math_hint = conv_node->fast_math_hint();
+ const unsigned int num_groups = conv_node->num_groups();
+ const NodeParams params = conv_node->common_node_params();
+ const Target assigned_target = conv_node->assigned_target();
// Extract node ids
- const NodeID input_id = conv_node->input_id(0);
- const NodeID weights_id = conv_node->input_id(1);
- const NodeID bias_id = conv_node->input_id(2);
+ ARM_COMPUTE_ERROR_ON(conv_node->input_edge(0) == nullptr || conv_node->input_edge(1) == nullptr);
+ const NodeID input_id = conv_node->input_edge(0)->producer()->id();
+ const NodeID weights_id = conv_node->input_edge(1)->producer()->id();
+ const NodeID bias_id = (conv_node->input_edge(2) != nullptr) ? conv_node->input_edge(2)->producer()->id() : EmptyNodeID;
// Get driving nodes
std::vector<NodeIdxPair> driving_nodes = get_driving_nodes(*node);
@@ -151,7 +158,7 @@
// Create grouped convolution node
NodeID grouped_conv_id = create_grouped_convolution(g, params, { input_id, 0 }, weights_id, bias_id,
- conv_info, conv_method, fast_math_hint, num_groups);
+ conv_info, conv_method, fused_act_info, fast_math_hint, num_groups);
// Remove convolution node
g.remove_node(node->id());
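
In the GroupedConvolutionMutator hunk above, producer node IDs are now resolved through the input edges, and a missing bias edge maps to EmptyNodeID instead of being dereferenced. A small standalone sketch of that null-safe lookup; NodeStub, EdgeStub and the sentinel value are placeholders, not the graph API.

#include <iostream>

using NodeID = unsigned int;
// Placeholder sentinel for "no node"; the library defines its own constant.
constexpr NodeID EmptyNodeID = 0;

// Minimal stand-ins for graph nodes and edges.
struct NodeStub
{
    NodeID node_id;
    NodeID id() const
    {
        return node_id;
    }
};

struct EdgeStub
{
    NodeStub *prod;
    NodeStub *producer() const
    {
        return prod;
    }
};

// Null-safe lookup: a convolution without a bias simply has no third input edge.
NodeID bias_producer_id(const EdgeStub *bias_edge)
{
    return (bias_edge != nullptr) ? bias_edge->producer()->id() : EmptyNodeID;
}

int main()
{
    NodeStub bias_node{ 42 };
    EdgeStub bias_edge{ &bias_node };

    std::cout << bias_producer_id(&bias_edge) << "\n"; // 42
    std::cout << bias_producer_id(nullptr) << "\n";    // 0: bias-less convolution
    return 0;
}
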
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 82bfe25..9dc02d1 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -38,44 +38,49 @@
{
namespace detail
{
-void fuse_batch_norm_with_activation(Graph &g)
+template <typename N>
+void fuse_node_with_activation(Graph &g,
+ const std::set<Activation> &supported_fused_activations,
+ std::function<bool(INode &)> const &prec)
{
- // Supported activations when fusing
- const std::set<Activation> supported_fused_activations = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU };
-
// Not interested in the order of nodes
for(auto &node : g.nodes())
{
- // Check if the node is batch norm and not a branching node
- if(node && node->type() == NodeType::BatchNormalizationLayer && node->output_edges().size() == 1)
+ // Check if the node is of type N and not a branching node
+ if(node && node->type() == N::node_type && node->output_edges().size() == 1)
{
auto output_edge_id = *node->output_edges().begin();
auto output_edge = g.edge(output_edge_id);
// Check if following node is an activation layer node
if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == NodeType::ActivationLayer))
{
- auto *bn_node = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->producer());
+ auto *n_node = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->producer());
auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(output_edge->consumer());
- ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || bn_node->output(0) == nullptr);
+ ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr);
+ // Check given precondition
+ if(!prec(*n_node))
+ {
+ continue;
+ }
// Check if activation is supported for fusion
if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
{
continue;
}
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing Batch Normalization node with ID : " << output_edge->producer_id()
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id()
<< " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
- // Prevent fusion if batch normalization node has an output accessor
- if(bn_node->output(0)->accessor() == nullptr)
+ // Prevent fusion if fused node has an output accessor
+ if(n_node->output(0)->accessor() == nullptr)
{
// Get driving nodes of activation node
std::vector<NodeIdxPair> act_driving_nodes = get_driving_nodes(*act_node);
- // Set activation info to batch normalization
- bn_node->set_fused_activation(act_node->activation_info());
+ // Set activation info to fused node
+ n_node->set_fused_activation(act_node->activation_info());
// Extract activation node accessor if any
auto act_node_accessor = act_node->output(0)->extract_accessor();
@@ -83,18 +88,18 @@
// Remove activation node
g.remove_node(act_node->id());
- // Update batch normalization node outputs
+ // Update fused node outputs
for(auto &driving_node : act_driving_nodes)
{
- g.add_connection(bn_node->id(), 0, driving_node.node_id, driving_node.index);
+ g.add_connection(n_node->id(), 0, driving_node.node_id, driving_node.index);
}
- // Update accessor to batch normalization node
- bn_node->output(0)->set_accessor(std::move(act_node_accessor));
+ // Update accessor to fused node
+ n_node->output(0)->set_accessor(std::move(act_node_accessor));
}
else
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion as batch normalization node has an output accessor\n");
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n");
}
}
}
@@ -109,7 +114,24 @@
void NodeFusionMutator::mutate(Graph &g)
{
- detail::fuse_batch_norm_with_activation(g);
+ // Supported activations when fusing
+ const std::set<Activation> supported_fused_activations = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU };
+
+ // Preconditions
+ auto empty_prec = [](INode & n)
+ {
+ return true;
+ };
+ auto qs8_prec = [](INode & n)
+ {
+ ARM_COMPUTE_ERROR_ON(n.output(0) == nullptr);
+ return n.output(0)->desc().data_type == DataType::QASYMM8;
+ };
+
+ // Fusion mutations
+ detail::fuse_node_with_activation<BatchNormalizationLayerNode>(g, supported_fused_activations, empty_prec);
+ detail::fuse_node_with_activation<ConvolutionLayerNode>(g, supported_fused_activations, empty_prec);
+ detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>(g, supported_fused_activations, qs8_prec);
}
} // namespace graph
} // namespace arm_compute
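
The NodeFusionMutator rewrite above folds the old batch-norm-specific pass into a templated fuse_node_with_activation<N> gated by a precondition callable: batch normalization and convolution pass an always-true lambda, while depthwise convolution only fuses when its output is QASYMM8. The sketch below shows the precondition-lambda pattern in isolation with stub types (NodeStub is not the graph's INode).

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Stub node with just enough state to drive a precondition check.
struct NodeStub
{
    std::string name;
    bool        is_qasymm8;
    bool        fused;
};

// Generic pass: mark a node as fused with its following activation only if the
// caller-supplied precondition accepts it.
void fuse_with_activation(std::vector<NodeStub> &nodes, const std::function<bool(NodeStub &)> &prec)
{
    for(auto &node : nodes)
    {
        if(prec(node))
        {
            node.fused = true;
        }
    }
}

int main()
{
    std::vector<NodeStub> nodes = { { "dwc_float", false, false }, { "dwc_q8", true, false } };

    // Mirrors the QASYMM8-only precondition used for depthwise convolution above.
    auto qs8_prec = [](NodeStub &n)
    {
        return n.is_qasymm8;
    };

    fuse_with_activation(nodes, qs8_prec);

    for(const auto &n : nodes)
    {
        std::cout << n.name << " fused: " << n.fused << "\n"; // only dwc_q8 is fused
    }
    return 0;
}
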
diff --git a/src/graph/nodes/BatchNormalizationLayerNode.cpp b/src/graph/nodes/BatchNormalizationLayerNode.cpp
index 3ae11fc..3d392bd 100644
--- a/src/graph/nodes/BatchNormalizationLayerNode.cpp
+++ b/src/graph/nodes/BatchNormalizationLayerNode.cpp
@@ -78,7 +78,7 @@
NodeType BatchNormalizationLayerNode::type() const
{
- return NodeType::BatchNormalizationLayer;
+ return BatchNormalizationLayerNode::node_type;
}
void BatchNormalizationLayerNode::accept(INodeVisitor &v)
diff --git a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
new file mode 100644
index 0000000..ad261e3
--- /dev/null
+++ b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+BoundingBoxTransformLayerNode::BoundingBoxTransformLayerNode(BoundingBoxTransformInfo &info)
+ : _bbox_info(info)
+{
+ _input_edges.resize(2, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+const BoundingBoxTransformInfo &BoundingBoxTransformLayerNode::info() const
+{
+ return _bbox_info;
+}
+
+bool BoundingBoxTransformLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor BoundingBoxTransformLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *deltas = input(1);
+ ARM_COMPUTE_ERROR_ON(deltas == nullptr);
+
+ TensorDescriptor output_desc = deltas->desc();
+ return output_desc;
+}
+
+NodeType BoundingBoxTransformLayerNode::type() const
+{
+ return NodeType::BoundingBoxTransformLayer;
+}
+
+void BoundingBoxTransformLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp
index e9cb039..15c7ff6 100644
--- a/src/graph/nodes/ConvolutionLayerNode.cpp
+++ b/src/graph/nodes/ConvolutionLayerNode.cpp
@@ -37,7 +37,7 @@
ConvolutionMethod method,
FastMathHint fast_math_hint,
QuantizationInfo out_quant_info)
- : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(out_quant_info)
+ : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(out_quant_info), _fused_activation()
{
_input_edges.resize(3, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -73,6 +73,16 @@
return _num_groups;
}
+ActivationLayerInfo ConvolutionLayerNode::fused_activation() const
+{
+ return _fused_activation;
+}
+
+void ConvolutionLayerNode::set_fused_activation(ActivationLayerInfo fused_activation)
+{
+ _fused_activation = fused_activation;
+}
+
TensorDescriptor ConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
const TensorDescriptor &weights_descriptor,
const PadStrideInfo &info)
@@ -126,7 +136,7 @@
NodeType ConvolutionLayerNode::type() const
{
- return NodeType::ConvolutionLayer;
+ return ConvolutionLayerNode::node_type;
}
void ConvolutionLayerNode::accept(INodeVisitor &v)
diff --git a/src/graph/nodes/DeconvolutionLayerNode.cpp b/src/graph/nodes/DeconvolutionLayerNode.cpp
index 9329ae3..e7ccffd 100644
--- a/src/graph/nodes/DeconvolutionLayerNode.cpp
+++ b/src/graph/nodes/DeconvolutionLayerNode.cpp
@@ -51,8 +51,7 @@
TensorDescriptor DeconvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
const TensorDescriptor &weights_descriptor,
- const PadStrideInfo &info,
- const Size2D &inner_border)
+ const PadStrideInfo &info)
{
unsigned int output_width = 0;
unsigned int output_height = 0;
@@ -65,7 +64,6 @@
std::tie(output_width, output_height) = deconvolution_output_dimensions(input_width, input_height,
kernel_width, kernel_height,
info.pad().first, info.pad().second,
- inner_border.x(), inner_border.y(),
info.stride().first, info.stride().second);
TensorDescriptor output_descriptor = input_descriptor;
@@ -96,7 +94,7 @@
ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
- TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info, _inner_border);
+ TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
return output_info;
}
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
index 1a6f8d3..02d1632 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -33,7 +33,7 @@
namespace graph
{
DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, DepthwiseConvolutionMethod method)
- : _info(std::move(info)), _method(method)
+ : _info(std::move(info)), _method(method), _fused_activation()
{
_input_edges.resize(3, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -54,6 +54,16 @@
return _info;
}
+ActivationLayerInfo DepthwiseConvolutionLayerNode::fused_activation() const
+{
+ return _fused_activation;
+}
+
+void DepthwiseConvolutionLayerNode::set_fused_activation(ActivationLayerInfo fused_activation)
+{
+ _fused_activation = fused_activation;
+}
+
TensorDescriptor DepthwiseConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
const TensorDescriptor &weights_descriptor,
const PadStrideInfo &info)
@@ -100,7 +110,7 @@
NodeType DepthwiseConvolutionLayerNode::type() const
{
- return NodeType::DepthwiseConvolutionLayer;
+ return DepthwiseConvolutionLayerNode::node_type;
}
void DepthwiseConvolutionLayerNode::accept(INodeVisitor &v)
diff --git a/src/graph/nodes/FlattenLayerNode.cpp b/src/graph/nodes/FlattenLayerNode.cpp
index 78b45dc..baae555 100644
--- a/src/graph/nodes/FlattenLayerNode.cpp
+++ b/src/graph/nodes/FlattenLayerNode.cpp
@@ -57,7 +57,7 @@
ARM_COMPUTE_ERROR_ON(src == nullptr);
TensorDescriptor output_desc = src->desc();
- output_desc.shape.collapse(src->desc().shape.num_dimensions());
+ output_desc.shape.collapse(3);
return output_desc;
}
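
The FlattenLayerNode change above collapses only the first three dimensions, so a trailing batch axis survives flattening; the previous collapse over all dimensions folded the batch into the feature vector as well. A quick check of the arithmetic in plain C++ (not the TensorShape API):

#include <cstddef>
#include <iostream>
#include <vector>

// Collapse the first n dimensions of a shape into one and keep the rest.
std::vector<std::size_t> collapse_front(const std::vector<std::size_t> &shape, std::size_t n)
{
    std::size_t flat = 1;
    for(std::size_t i = 0; i < n && i < shape.size(); ++i)
    {
        flat *= shape[i];
    }
    std::vector<std::size_t> out{ flat };
    for(std::size_t i = n; i < shape.size(); ++i)
    {
        out.push_back(shape[i]);
    }
    return out;
}

int main()
{
    // Shape is (W, H, C, N) = (7, 7, 512, 8).
    const std::vector<std::size_t> shape = { 7, 7, 512, 8 };

    const auto flattened = collapse_front(shape, 3); // (25088, 8): batch axis preserved
    const auto all       = collapse_front(shape, 4); // (200704): batch folded in as well

    std::cout << flattened[0] << " x " << flattened[1] << "\n";
    std::cout << all[0] << "\n";
    return 0;
}
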
diff --git a/src/graph/nodes/GenerateProposalsLayerNode.cpp b/src/graph/nodes/GenerateProposalsLayerNode.cpp
new file mode 100644
index 0000000..7367e80
--- /dev/null
+++ b/src/graph/nodes/GenerateProposalsLayerNode.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+GenerateProposalsLayerNode::GenerateProposalsLayerNode(GenerateProposalsInfo &info)
+ : _info(info)
+{
+ _input_edges.resize(3, EmptyEdgeID);
+ _outputs.resize(3, NullTensorID);
+}
+
+const GenerateProposalsInfo &GenerateProposalsLayerNode::info() const
+{
+ return _info;
+}
+
+bool GenerateProposalsLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID)
+ && (output_id(2) != NullTensorID))
+ {
+ for(unsigned int i = 0; i < 3; ++i)
+ {
+ Tensor *dst = output(i);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(i);
+ }
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor GenerateProposalsLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_ERROR_ON(idx > 3);
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+ TensorDescriptor output_desc = src->desc();
+
+ switch(idx)
+ {
+ case 0:
+ // Configure proposals output
+ output_desc.shape = TensorShape(5, src->desc().shape.total_size());
+ break;
+ case 1:
+ // Configure scores_out output
+ output_desc.shape = TensorShape(src->desc().shape.total_size());
+ break;
+ case 2:
+ // Configure num_valid_proposals
+ output_desc.shape = TensorShape(1);
+ output_desc.data_type = DataType::U32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported output index");
+ }
+ return output_desc;
+}
+
+NodeType GenerateProposalsLayerNode::type() const
+{
+ return NodeType::GenerateProposalsLayer;
+}
+
+void GenerateProposalsLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
new file mode 100644
index 0000000..129b380
--- /dev/null
+++ b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+NormalizePlanarYUVLayerNode::NormalizePlanarYUVLayerNode()
+{
+ _input_edges.resize(3, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+bool NormalizePlanarYUVLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor NormalizePlanarYUVLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return src->desc();
+}
+
+NodeType NormalizePlanarYUVLayerNode::type() const
+{
+ return NodeType::NormalizePlanarYUVLayer;
+}
+
+void NormalizePlanarYUVLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/PadLayerNode.cpp b/src/graph/nodes/PadLayerNode.cpp
new file mode 100644
index 0000000..e7996d2
--- /dev/null
+++ b/src/graph/nodes/PadLayerNode.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/PadLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+PadLayerNode::PadLayerNode(PaddingList &padding)
+ : _padding(padding)
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+const PaddingList &PadLayerNode::padding() const
+{
+ return _padding;
+}
+
+bool PadLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor PadLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ TensorDescriptor output_desc = src->desc();
+ const TensorShape input_shape = src->desc().shape;
+ for(size_t dim = 0; dim < _padding.size(); ++dim)
+ {
+ output_desc.shape.set(dim, _padding[dim].first + input_shape[dim] + _padding[dim].second);
+ }
+
+ return output_desc;
+}
+
+NodeType PadLayerNode::type() const
+{
+ return NodeType::PadLayer;
+}
+
+void PadLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
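
PadLayerNode::configure_output() above grows each padded dimension by its before/after amounts and leaves dimensions beyond the padding list untouched. A short arithmetic check, using std::pair in place of the library's PaddingList:

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main()
{
    // Input shape (W, H, C) and a padding list covering the first two dimensions.
    const std::vector<std::size_t>                         input_shape = { 224, 224, 3 };
    const std::vector<std::pair<std::size_t, std::size_t>> padding     = { { 1, 1 }, { 2, 2 } };

    std::vector<std::size_t> output_shape = input_shape;
    for(std::size_t dim = 0; dim < padding.size(); ++dim)
    {
        // before + input + after, matching the configure_output() loop above.
        output_shape[dim] = padding[dim].first + input_shape[dim] + padding[dim].second;
    }

    std::cout << output_shape[0] << " x " << output_shape[1] << " x " << output_shape[2] << "\n"; // 226 x 228 x 3
    return 0;
}
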
diff --git a/src/graph/nodes/PriorBoxLayerNode.cpp b/src/graph/nodes/PriorBoxLayerNode.cpp
new file mode 100644
index 0000000..edb1fba
--- /dev/null
+++ b/src/graph/nodes/PriorBoxLayerNode.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/PriorBoxLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+PriorBoxLayerNode::PriorBoxLayerNode(PriorBoxLayerInfo prior_info)
+ : _info(std::move(prior_info))
+{
+ _input_edges.resize(2, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+PriorBoxLayerInfo PriorBoxLayerNode::priorbox_info() const
+{
+ return _info;
+}
+
+TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const PriorBoxLayerInfo &info)
+{
+ const unsigned int layer_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int layer_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+ const unsigned int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape.set(0, layer_width * layer_height * num_priors * 4);
+ output_descriptor.shape.set(1, 2);
+ output_descriptor.shape.set(2, 1);
+
+ return output_descriptor;
+}
+
+bool PriorBoxLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor PriorBoxLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *input0 = input(0);
+ ARM_COMPUTE_ERROR_ON(input0 == nullptr);
+
+ return compute_output_descriptor(input0->desc(), _info);
+}
+
+NodeType PriorBoxLayerNode::type() const
+{
+ return NodeType::PriorBoxLayer;
+}
+
+void PriorBoxLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
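
PriorBoxLayerNode above packs layer_width * layer_height * num_priors * 4 box coordinates into the first output dimension, with num_priors = aspect_ratios * min_sizes + max_sizes; the second dimension of 2 commonly holds the coordinates and their variances. A quick check with hypothetical SSD-style numbers (not taken from the library):

#include <iostream>

int main()
{
    // Hypothetical prior-box configuration; the numbers are illustrative only.
    const unsigned int layer_width   = 19;
    const unsigned int layer_height  = 19;
    const unsigned int aspect_ratios = 3; // info.aspect_ratios().size()
    const unsigned int min_sizes     = 1; // info.min_sizes().size()
    const unsigned int max_sizes     = 1; // info.max_sizes().size()

    const unsigned int num_priors = aspect_ratios * min_sizes + max_sizes;       // 4 priors per cell
    const unsigned int dim0       = layer_width * layer_height * num_priors * 4; // 5776 coordinates

    std::cout << "num_priors: " << num_priors << "\n";
    std::cout << "output shape: (" << dim0 << ", 2, 1)\n";
    return 0;
}
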
diff --git a/src/graph/nodes/ROIAlignLayerNode.cpp b/src/graph/nodes/ROIAlignLayerNode.cpp
new file mode 100644
index 0000000..5e89ef2
--- /dev/null
+++ b/src/graph/nodes/ROIAlignLayerNode.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/graph/nodes/ROIAlignLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+ROIAlignLayerNode::ROIAlignLayerNode(ROIPoolingLayerInfo &pool_info)
+ : _pool_info(pool_info)
+{
+ _input_edges.resize(2, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+const ROIPoolingLayerInfo &ROIAlignLayerNode::pooling_info() const
+{
+ return _pool_info;
+}
+
+bool ROIAlignLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor ROIAlignLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ const Tensor *rois = input(1);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+ ARM_COMPUTE_ERROR_ON(rois == nullptr);
+
+ TensorDescriptor output_desc = src->desc();
+
+ const size_t idx_n = get_data_layout_dimension_index(output_desc.layout, DataLayoutDimension::BATCHES);
+ const size_t idx_c = get_data_layout_dimension_index(output_desc.layout, DataLayoutDimension::CHANNEL);
+ const size_t idx_h = get_data_layout_dimension_index(output_desc.layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_w = get_data_layout_dimension_index(output_desc.layout, DataLayoutDimension::WIDTH);
+
+ output_desc.shape.set(idx_n, rois->desc().shape[1]);
+ output_desc.shape.set(idx_c, src->desc().shape[idx_c]);
+ output_desc.shape.set(idx_h, _pool_info.pooled_height());
+ output_desc.shape.set(idx_w, _pool_info.pooled_width());
+
+ return output_desc;
+}
+
+NodeType ROIAlignLayerNode::type() const
+{
+ return NodeType::ROIAlignLayer;
+}
+
+void ROIAlignLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
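
ROIAlignLayerNode above maps its output shape as follows: channels come from the feature-map input, pooled width/height from the ROIPoolingLayerInfo, and the batch dimension from the second dimension of the ROIs tensor, i.e. one pooled block per ROI. A plain sketch of that mapping with made-up sizes, written in (W, H, C, N) order purely for readability:

#include <array>
#include <iostream>

int main()
{
    // Hypothetical sizes; the values are illustrative only.
    const unsigned int channels      = 256; // from the feature-map input
    const unsigned int num_rois      = 300; // rois tensor shape[1]
    const unsigned int pooled_width  = 7;   // ROIPoolingLayerInfo::pooled_width()
    const unsigned int pooled_height = 7;   // ROIPoolingLayerInfo::pooled_height()

    // One (pooled_w x pooled_h x C) block per ROI.
    const std::array<unsigned int, 4> output_shape = { pooled_width, pooled_height, channels, num_rois };

    std::cout << output_shape[0] << " x " << output_shape[1] << " x "
              << output_shape[2] << " x " << output_shape[3] << "\n"; // 7 x 7 x 256 x 300
    return 0;
}
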
diff --git a/src/graph/nodes/ReorgLayerNode.cpp b/src/graph/nodes/ReorgLayerNode.cpp
new file mode 100644
index 0000000..6b83f6b
--- /dev/null
+++ b/src/graph/nodes/ReorgLayerNode.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ReorgLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+ReorgLayerNode::ReorgLayerNode(int stride)
+ : _stride(stride)
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+int ReorgLayerNode::stride() const
+{
+ return _stride;
+}
+
+TensorDescriptor ReorgLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, int stride)
+{
+ const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+ const unsigned int input_channel = get_dimension_size(input_descriptor, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_ERROR_ON(stride <= 0);
+ ARM_COMPUTE_ERROR_ON_MSG((input_width % stride != 0), "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0), "The height of the input tensor must be a multiple of stride");
+
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), input_width / stride);
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), input_height / stride);
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), input_channel * stride * stride);
+
+ return output_descriptor;
+}
+
+bool ReorgLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor ReorgLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return compute_output_descriptor(src->desc(), _stride);
+}
+
+NodeType ReorgLayerNode::type() const
+{
+ return NodeType::ReorgLayer;
+}
+
+void ReorgLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
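
ReorgLayerNode above trades spatial resolution for channels: width and height shrink by the stride while the channel count grows by stride squared, so the element count is preserved; the checks in compute_output_descriptor() require both spatial dimensions to be multiples of the stride. A quick check:

#include <cassert>
#include <iostream>

int main()
{
    // Hypothetical YOLO-style reorg: a 26x26x64 tensor with stride 2.
    const unsigned int width = 26, height = 26, channels = 64, stride = 2;

    // Mirrors the checks above: both spatial dimensions must divide evenly.
    assert(width % stride == 0 && height % stride == 0);

    const unsigned int out_w = width / stride;             // 13
    const unsigned int out_h = height / stride;            // 13
    const unsigned int out_c = channels * stride * stride; // 256

    std::cout << out_w << " x " << out_h << " x " << out_c << "\n";
    std::cout << "elements preserved: " << (width * height * channels == out_w * out_h * out_c) << "\n"; // 1
    return 0;
}
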
diff --git a/src/graph/nodes/SliceLayerNode.cpp b/src/graph/nodes/SliceLayerNode.cpp
new file mode 100644
index 0000000..3a29e4c
--- /dev/null
+++ b/src/graph/nodes/SliceLayerNode.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/SliceLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+SliceLayerNode::SliceLayerNode(Coordinates &starts, Coordinates &ends)
+ : _starts(starts), _ends(ends)
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+Coordinates SliceLayerNode::starts() const
+{
+ return _starts;
+}
+
+Coordinates SliceLayerNode::ends() const
+{
+ return _ends;
+}
+
+TensorDescriptor SliceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const Coordinates &starts, const Coordinates &ends)
+{
+ // Get absolute end coordinates
+ const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input_descriptor.shape, ends);
+
+ TensorDescriptor output_descriptor = input_descriptor;
+ for(unsigned int i = 0; i < starts.num_dimensions(); ++i)
+ {
+ output_descriptor.shape.set(i, ends_abs[i] - starts[i]);
+ }
+
+ return output_descriptor;
+}
+
+bool SliceLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor SliceLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return compute_output_descriptor(src->desc(), _starts, _ends);
+}
+
+NodeType SliceLayerNode::type() const
+{
+ return NodeType::SliceLayer;
+}
+
+void SliceLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
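SliceLayerNode sizes each output dimension as ends_abs[i] - starts[i], where slice_absolute_end_coords() resolves the user-supplied ends against the input shape (negative values are assumed here to count back from the end of the dimension). A self-contained sketch of that shape arithmetic:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Resolve possibly-negative end coordinates against the input shape
    // (assumed behaviour of slice_absolute_end_coords(); illustrative only).
    static std::vector<int> absolute_ends(const std::vector<int> &shape, const std::vector<int> &ends)
    {
        std::vector<int> abs_ends(ends.size());
        for(std::size_t i = 0; i < ends.size(); ++i)
        {
            abs_ends[i] = (ends[i] < 0) ? shape[i] + ends[i] : ends[i];
        }
        return abs_ends;
    }

    int main()
    {
        const std::vector<int> input_shape = { 32, 32, 3 };
        const std::vector<int> starts      = { 4, 8, 0 };
        const std::vector<int> ends        = { 28, -8, 3 }; // -8 resolves to 32 - 8 = 24

        const std::vector<int> ends_abs = absolute_ends(input_shape, ends);
        for(std::size_t i = 0; i < starts.size(); ++i)
        {
            // Matches compute_output_descriptor(): out[i] = ends_abs[i] - starts[i]
            std::printf("dim %zu: %d\n", i, ends_abs[i] - starts[i]); // 24, 16, 3
        }
        return 0;
    }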
diff --git a/src/graph/nodes/UpsampleLayerNode.cpp b/src/graph/nodes/UpsampleLayerNode.cpp
new file mode 100644
index 0000000..bdd39e8
--- /dev/null
+++ b/src/graph/nodes/UpsampleLayerNode.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/UpsampleLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+UpsampleLayerNode::UpsampleLayerNode(Size2D info, InterpolationPolicy upsampling_policy)
+ : _info(info), _upsampling_policy(upsampling_policy)
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+Size2D UpsampleLayerNode::info() const
+{
+ return _info;
+}
+
+InterpolationPolicy UpsampleLayerNode::upsampling_policy() const
+{
+ return _upsampling_policy;
+}
+
+TensorDescriptor UpsampleLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ Size2D info)
+{
+ const unsigned int input_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+ const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+
+ TensorDescriptor output_descriptor = input_descriptor;
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), input_width * info.x());
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), input_height * info.y());
+
+ return output_descriptor;
+}
+
+bool UpsampleLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor UpsampleLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return compute_output_descriptor(src->desc(), _info);
+}
+
+NodeType UpsampleLayerNode::type() const
+{
+ return NodeType::UpsampleLayer;
+}
+
+void UpsampleLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
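UpsampleLayerNode only scales the two spatial dimensions: width is multiplied by info.x() and height by info.y(), while channel and batch dimensions pass through unchanged. A quick check of that arithmetic on an assumed {W, H, C, N} ordering:

    #include <array>
    #include <cstdio>

    int main()
    {
        // Descriptor as {W, H, C, N}; only W and H are scaled by the upsample info,
        // matching compute_output_descriptor() above.
        std::array<unsigned int, 4> shape = { 16, 16, 64, 1 };
        const unsigned int scale_x = 2, scale_y = 2; // Size2D info

        shape[0] *= scale_x; // width  -> 32
        shape[1] *= scale_y; // height -> 32

        std::printf("%u x %u x %u x %u\n", shape[0], shape[1], shape[2], shape[3]); // 32 x 32 x 64 x 1
        return 0;
    }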
diff --git a/src/graph/nodes/YOLOLayerNode.cpp b/src/graph/nodes/YOLOLayerNode.cpp
new file mode 100644
index 0000000..cf1e576
--- /dev/null
+++ b/src/graph/nodes/YOLOLayerNode.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/YOLOLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+YOLOLayerNode::YOLOLayerNode(ActivationLayerInfo act_info, int32_t num_classes)
+ : _act_info(act_info), _num_classes(num_classes)
+{
+ _input_edges.resize(1, EmptyEdgeID);
+ _outputs.resize(1, NullTensorID);
+}
+
+ActivationLayerInfo YOLOLayerNode::activation_info() const
+{
+ return _act_info;
+}
+
+int32_t YOLOLayerNode::num_classes() const
+{
+ return _num_classes;
+}
+
+bool YOLOLayerNode::forward_descriptors()
+{
+ if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ {
+ Tensor *dst = output(0);
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ dst->desc() = configure_output(0);
+ return true;
+ }
+ return false;
+}
+
+TensorDescriptor YOLOLayerNode::configure_output(size_t idx) const
+{
+ ARM_COMPUTE_UNUSED(idx);
+ ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+ const Tensor *src = input(0);
+ ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+ return src->desc();
+}
+
+NodeType YOLOLayerNode::type() const
+{
+ return NodeType::YOLOLayer;
+}
+
+void YOLOLayerNode::accept(INodeVisitor &v)
+{
+ v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index 29505e5..e09451c 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,7 +52,7 @@
for(auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
- *handle.first = _blobs[handle.second];
+ handle.first->set_region(_blobs[handle.second].get());
}
}
@@ -61,7 +61,7 @@
for(auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
- *handle.first = nullptr;
+ handle.first->set_region(nullptr);
}
}
@@ -82,17 +82,11 @@
for(const auto &size : sizes)
{
- _blobs.push_back(_allocator->allocate(size, 0));
+ _blobs.push_back(_allocator->make_region(size, 0));
}
}
void BlobMemoryPool::free_blobs()
{
- ARM_COMPUTE_ERROR_ON(!_allocator);
-
- for(auto &blob : _blobs)
- {
- _allocator->free(blob);
- }
_blobs.clear();
}
\ No newline at end of file
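The BlobMemoryPool change above moves from handing raw pointers to handles (*handle.first = blob) to binding memory regions: acquire() attaches each handle to its blob with set_region(), release() detaches it, and the blobs are owned by the pool as regions obtained from the allocator, so free_blobs() only clears the container. A reduced sketch of that acquire/release cycle with simplified stand-in types (not the real IMemory/IMemoryRegion interfaces):

    #include <cstddef>
    #include <map>
    #include <memory>
    #include <vector>

    // Simplified stand-ins for IMemory / IMemoryRegion (illustrative only).
    struct Region
    {
        explicit Region(std::size_t s) : storage(s) {}
        std::vector<unsigned char> storage;
    };
    struct Memory
    {
        Region *region = nullptr;
        void set_region(Region *r) { region = r; }
    };

    class ExamplePool
    {
    public:
        explicit ExamplePool(const std::vector<std::size_t> &sizes)
        {
            for(std::size_t s : sizes)
            {
                _blobs.emplace_back(new Region(s)); // stands in for _allocator->make_region(size, 0)
            }
        }
        // handles: memory object -> blob index, as in BlobMemoryPool::acquire()/release()
        void acquire(std::map<Memory *, std::size_t> &handles)
        {
            for(auto &h : handles) { h.first->set_region(_blobs[h.second].get()); }
        }
        void release(std::map<Memory *, std::size_t> &handles)
        {
            for(auto &h : handles) { h.first->set_region(nullptr); }
        }
    private:
        std::vector<std::unique_ptr<Region>> _blobs; // owned regions; clearing the vector frees them
    };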
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index bbc513d..5bea85c 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -24,23 +24,20 @@
#include "arm_compute/runtime/CL/CLMemory.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/Cast.h"
namespace arm_compute
{
CLMemory::CLMemory()
: _region(nullptr), _region_owned(nullptr)
{
- create_empty_region();
}
CLMemory::CLMemory(std::shared_ptr<ICLMemoryRegion> memory)
: _region(nullptr), _region_owned(std::move(memory))
{
- if(_region_owned == nullptr)
- {
- create_empty_region();
- }
- _region = _region_owned.get();
+ _region = _region_owned.get();
}
CLMemory::CLMemory(ICLMemoryRegion *memory)
@@ -49,19 +46,36 @@
_region = memory;
}
-ICLMemoryRegion *CLMemory::region()
+ICLMemoryRegion *CLMemory::cl_region()
{
return _region;
}
-ICLMemoryRegion *CLMemory::region() const
+ICLMemoryRegion *CLMemory::cl_region() const
{
return _region;
}
-void CLMemory::create_empty_region()
+IMemoryRegion *CLMemory::region()
{
- _region_owned = std::make_shared<CLBufferMemoryRegion>(cl::Context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0);
+ return _region;
+}
+
+IMemoryRegion *CLMemory::region() const
+{
+ return _region;
+}
+
+void CLMemory::set_region(IMemoryRegion *region)
+{
+ auto cl_region = utils::cast::polymorphic_downcast<ICLMemoryRegion *>(region);
+ _region_owned = nullptr;
+ _region = cl_region;
+}
+
+void CLMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
+{
+ _region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region));
_region = _region_owned.get();
}
} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 15fd7f3..9578d73 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -48,9 +48,10 @@
return _mapping;
}
-void **ICLMemoryRegion::handle()
+std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset, size_t size)
{
- return reinterpret_cast<void **>(&_mem);
+ ARM_COMPUTE_UNUSED(offset, size);
+ return nullptr;
}
CLBufferMemoryRegion::CLBufferMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size)
@@ -62,6 +63,12 @@
}
}
+CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer)
+ : ICLMemoryRegion(buffer.getInfo<CL_MEM_CONTEXT>(), buffer.getInfo<CL_MEM_SIZE>())
+{
+ _mem = buffer;
+}
+
void *CLBufferMemoryRegion::ptr()
{
return nullptr;
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index dd716f7..0307498 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -28,86 +28,87 @@
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-using namespace arm_compute;
+namespace arm_compute
+{
+const cl::Buffer CLTensorAllocator::_empty_buffer = cl::Buffer();
namespace
{
-std::shared_ptr<arm_compute::ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
+std::unique_ptr<ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
{
// Try fine-grain SVM
- std::shared_ptr<ICLMemoryRegion> region = std::make_shared<CLFineSVMMemoryRegion>(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);
+ std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(context,
+ CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
+ size,
+ alignment);
// Try coarse-grain SVM in case of failure
if(region != nullptr && region->ptr() == nullptr)
{
- region = std::make_shared<CLCoarseSVMMemoryRegion>(context, CL_MEM_READ_WRITE, size, alignment);
+ region = support::cpp14::make_unique<CLCoarseSVMMemoryRegion>(context, CL_MEM_READ_WRITE, size, alignment);
}
// Try legacy buffer memory in case of failure
if(region != nullptr && region->ptr() == nullptr)
{
- region = std::make_shared<CLBufferMemoryRegion>(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+ region = support::cpp14::make_unique<CLBufferMemoryRegion>(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
}
return region;
}
} // namespace
CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
- : _associated_memory_group(nullptr), _memory(), _owner(owner)
+ : _associated_memory_group(nullptr), _memory(), _mapping(nullptr), _owner(owner)
{
}
uint8_t *CLTensorAllocator::data()
{
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
+ return _mapping;
}
const cl::Buffer &CLTensorAllocator::cl_data() const
{
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- return _memory.region()->cl_data();
+ return _memory.region() == nullptr ? _empty_buffer : _memory.cl_region()->cl_data();
}
void CLTensorAllocator::allocate()
{
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-
if(_associated_memory_group == nullptr)
{
- if(_memory.region()->cl_data().get() != nullptr)
+ if(_memory.region() != nullptr && _memory.cl_region()->cl_data().get() != nullptr)
{
// Memory is already allocated. Reuse it if big enough, otherwise fire an assertion
- ARM_COMPUTE_ERROR_ON_MSG(info().total_size() > _memory.region()->size(), "Reallocation of a bigger memory region is not allowed!");
+ ARM_COMPUTE_ERROR_ON_MSG(info().total_size() > _memory.region()->size(),
+ "Reallocation of a bigger memory region is not allowed!");
}
else
{
// Perform memory allocation
- _memory = CLMemory(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
+ _memory.set_owned_region(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
}
}
else
{
- _associated_memory_group->finalize_memory(_owner, _memory.region()->handle(), info().total_size());
- _memory.region()->set_size(info().total_size());
+ _associated_memory_group->finalize_memory(_owner, _memory, info().total_size());
}
info().set_is_resizable(false);
}
void CLTensorAllocator::free()
{
- if(_associated_memory_group == nullptr)
- {
- _memory = CLMemory();
- info().set_is_resizable(true);
- }
+ _mapping = nullptr;
+ _memory.set_region(nullptr);
+ info().set_is_resizable(true);
}
-arm_compute::Status CLTensorAllocator::import_memory(CLMemory memory)
+arm_compute::Status CLTensorAllocator::import_memory(cl::Buffer buffer)
{
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->cl_data().get() == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(buffer.get() == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_SIZE>() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_CONTEXT>().get() != CLScheduler::get().context().get());
ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
- _memory = memory;
+
+ _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer));
info().set_is_resizable(false);
return Status{};
@@ -115,11 +116,10 @@
void CLTensorAllocator::set_associated_memory_group(CLMemoryGroup *associated_memory_group)
{
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
- ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
- _memory = CLMemory(std::make_shared<CLBufferMemoryRegion>(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0));
+ ARM_COMPUTE_ERROR_ON(_memory.region() != nullptr && _memory.cl_region()->cl_data().get() != nullptr);
+
_associated_memory_group = associated_memory_group;
}
@@ -136,16 +136,23 @@
uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking)
{
+ ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
- _memory.region()->map(q, blocking);
- return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
+
+ _mapping = reinterpret_cast<uint8_t *>(_memory.cl_region()->map(q, blocking));
+ return _mapping;
}
void CLTensorAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
{
- ARM_COMPUTE_UNUSED(mapping);
+ ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+ ARM_COMPUTE_ERROR_ON(_mapping != mapping);
ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() == nullptr);
- _memory.region()->unmap(q);
+ ARM_COMPUTE_UNUSED(mapping);
+
+ _memory.cl_region()->unmap(q);
+ _mapping = nullptr;
}
+} // namespace arm_compute
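With the allocator rework above, import_memory() now accepts a bare cl::Buffer and wraps it in a CLBufferMemoryRegion, after checking that the buffer is non-null, non-empty and created on the scheduler's context. A hedged usage sketch based only on the signatures visible in this diff (tensor shapes and data movement elided):

    // Sketch only: uses the setup calls from the library headers; error handling kept minimal.
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void import_external_buffer()
    {
        CLScheduler::get().default_init();

        // Externally created buffer; it must live on the scheduler's context,
        // otherwise the checks added to import_memory() reject it.
        const size_t bytes = 16 * 16 * sizeof(float);
        cl::Buffer external(CLScheduler::get().context(), CL_MEM_READ_WRITE, bytes);

        CLTensor tensor;
        tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        // Wraps the buffer in a CLBufferMemoryRegion instead of allocating new memory.
        const Status status = tensor.allocator()->import_memory(external);
        ARM_COMPUTE_ERROR_THROW_ON(status);
    }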
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
index 5fca30c..e661f6a 100644
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
#include "support/ToolchainSupport.h"
@@ -30,11 +31,21 @@
using namespace arm_compute;
-void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
}
Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
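The configure() change above enables x-dimension broadcasting for the subtraction: when one input has dimension(0) == 1 and the output does not, that input is padded with BorderMode::REPLICATE so the kernel can read a full vector from its single column. The corresponding shape rule in isolation (illustrative helper, not a library function):

    #include <cstdio>

    // x-dimension broadcasting as exercised by the configure() overload above:
    // an operand with dimension(0) == 1 is replicated across the other operand's width.
    static unsigned int broadcast_dim0(unsigned int a, unsigned int b)
    {
        if(a == b)
        {
            return a;
        }
        if(a == 1 || b == 1)
        {
            return (a == 1) ? b : a;
        }
        return 0; // incompatible shapes
    }

    int main()
    {
        std::printf("%u\n", broadcast_dim0(1, 128));   // 128: the first operand is broadcast
        std::printf("%u\n", broadcast_dim0(128, 128)); // 128: no broadcasting needed
        return 0;
    }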
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
new file mode 100644
index 0000000..7919b13
--- /dev/null
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLBatchToSpaceLayer::CLBatchToSpaceLayer()
+ : _batch_to_space_kernel()
+{
+}
+
+void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+{
+ _batch_to_space_kernel.configure(input, block_shape, output);
+}
+
+void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output)
+{
+ _batch_to_space_kernel.configure(input, block_shape_x, block_shape_y, output);
+}
+
+Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+{
+ return CLBatchToSpaceLayerKernel::validate(input, block_shape, output);
+}
+
+Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output)
+{
+ return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output);
+}
+
+void CLBatchToSpaceLayer::run()
+{
+ CLScheduler::get().enqueue(_batch_to_space_kernel, true);
+}
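CLBatchToSpaceLayer itself only enqueues one kernel; the shape relation it realises is that, with block shape (bx, by), batches are folded back into the spatial plane — an assumed {W, H, C, N} input becomes {W*bx, H*by, C, N/(bx*by)}, provided N divides evenly. A quick numeric check:

    #include <cassert>

    int main()
    {
        // Batch-to-space shape arithmetic on an assumed {W, H, C, N} ordering.
        const unsigned int w = 8, h = 8, n = 12;
        const unsigned int bx = 2, by = 2;

        assert(n % (bx * by) == 0);               // the batch must fold evenly into the blocks
        const unsigned int out_w = w * bx;        // 16
        const unsigned int out_h = h * by;        // 16
        const unsigned int out_n = n / (bx * by); // 3

        assert(out_w == 16u && out_h == 16u && out_n == 3u);
        return 0;
    }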
diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
new file mode 100644
index 0000000..46a6b8e
--- /dev/null
+++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
+
+#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+{
+ // Configure Bounding Box kernel
+ auto k = arm_compute::support::cpp14::make_unique<CLBoundingBoxTransformKernel>();
+ k->configure(boxes, pred_boxes, deltas, info);
+ _kernel = std::move(k);
+}
+
+Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+{
+ return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
new file mode 100644
index 0000000..409d3c9
--- /dev/null
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLComputeAllAnchors.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLComputeAllAnchors::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+{
+ // Configure ComputeAllAnchors kernel
+ auto k = arm_compute::support::cpp14::make_unique<CLComputeAllAnchorsKernel>();
+ k->configure(anchors, all_anchors, info);
+ _kernel = std::move(k);
+}
+
+Status CLComputeAllAnchors::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+{
+ return CLComputeAllAnchorsKernel::validate(anchors, all_anchors, info);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 40562b5..e07feb2 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -27,6 +27,8 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
#include <memory>
#include <tuple>
@@ -38,7 +40,10 @@
: _memory_group(std::move(memory_manager)),
_scale_f(),
_conv_f(),
+ _flip_weights(),
_scaled_output(),
+ _original_weights(nullptr),
+ _weights_flipped(),
_is_prepared(false)
{
}
@@ -47,9 +52,17 @@
unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
const unsigned int stride_x = info.stride().first;
@@ -58,24 +71,34 @@
ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
- auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1),
- info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
+ info.pad().first, info.pad().second, stride_x, stride_y);
- const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
if(bias != nullptr)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ if(is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, stride_x, stride_y, inner_border_right, inner_border_top,
- info)));
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
@@ -84,7 +107,7 @@
return Status{};
}
-void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -92,36 +115,46 @@
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
- info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+ const DataLayout data_layout = input->info()->data_layout();
- const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ _original_weights = weights;
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(weights, &_weights_flipped);
+
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
+ info.pad().first, info.pad().second, stride_x, stride_y);
+
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
- _is_prepared = false;
+ _is_prepared = weights_info.retain_internal_weights();
_memory_group.manage(&_scaled_output);
- // configure scale function
- // Init and allocate intermmidiate tensor for output, same size as input but the first two axis are the same as the output tensor
- TensorShape scale_out_shape(input->info()->tensor_shape());
- const unsigned int out_x = input->info()->dimension(0) + (input->info()->dimension(0) - 1) * (stride_x - 1) + inner_border_right + 2 * info.pad().first;
- const unsigned int out_y = input->info()->dimension(1) + (input->info()->dimension(1) - 1) * (stride_y - 1) + inner_border_top + 2 * info.pad().second;
- scale_out_shape.set(0, out_x);
- scale_out_shape.set(1, out_y);
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type());
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
_scaled_output.allocator()->init(scale_out_info);
- _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), info);
+ // configure scale function
+ const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+ _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), upsample_info);
// setup the function to convolve the upscaled output
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- _conv_f.configure(&_scaled_output, weights, bias, output, conv_info, weights_info);
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
_scaled_output.allocator()->allocate();
}
@@ -141,7 +174,25 @@
{
if(!_is_prepared)
{
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ _weights_flipped.map(true);
+ _original_weights->map(CLScheduler::get().queue(), true);
+ CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
+ _weights_flipped.unmap();
+ _original_weights->unmap(CLScheduler::get().queue());
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
_conv_f.prepare();
+
+ if(!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
_is_prepared = true;
}
}
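The reworked deconvolution now flips the weights on the host during prepare() and reduces the operator to a stride-1 convolution over an upsampled input, with compute_deconvolution_upsampled_shape() also returning the padx/pady needed to hit the target size. The target itself follows the usual transposed-convolution relation out = stride * (in - 1) + kernel - 2 * pad (inner border terms omitted); a small numeric check of that formula:

    #include <cassert>

    // Usual transposed-convolution output size (inner border terms omitted):
    // out = stride * (in - 1) + kernel - 2 * pad
    static unsigned int deconv_out_dim(unsigned int in, unsigned int kernel, unsigned int pad, unsigned int stride)
    {
        return stride * (in - 1) + kernel - 2 * pad;
    }

    int main()
    {
        // A 14x14 input with a 3x3 kernel, stride 2 and pad 1 yields a 27x27 output;
        // the stride-1 convolution over the upsampled tensor has to reproduce this size,
        // which is what the returned padx/pady arrange.
        assert(deconv_out_dim(14, 3, 1, 2) == 27u);
        return 0;
    }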
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index 13a24f8..ce8667d 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,7 +57,15 @@
void CLDeconvolutionLayerUpsample::run()
{
_output->map(CLScheduler::get().queue(), true);
- memset(_output->buffer(), 0, _output->info()->total_size());
+ if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
+ {
+ const uint8_t quantized_zero = _output->info()->quantization_info().offset;
+ std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
+ }
+ else
+ {
+ memset(_output->buffer(), 0, _output->info()->total_size());
+ }
_output->unmap(CLScheduler::get().queue());
CLScheduler::get().enqueue(_upsample, false);
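The run() change matters for QASYMM8 output: the gaps introduced by upsampling must be filled with the value that dequantises to 0.0f, i.e. the quantisation offset (zero point), not the byte 0 — filling with 0 would insert the real value -offset * scale between samples. A minimal illustration:

    #include <cstdint>
    #include <cstdio>

    // Asymmetric 8-bit dequantisation: real = scale * (q - offset)
    static float dequantize(uint8_t q, float scale, int offset)
    {
        return scale * (static_cast<int>(q) - offset);
    }

    int main()
    {
        const float scale  = 0.05f;
        const int   offset = 128; // the zero point

        std::printf("fill with offset -> %.2f\n", dequantize(128, scale, offset)); //  0.00
        std::printf("fill with 0      -> %.2f\n", dequantize(0, scale, offset));   // -6.40
        return 0;
    }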
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 76451af..497cdae 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -90,12 +90,13 @@
}
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
- : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
- _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr)
+ : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _activationlayer_function(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(),
+ _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
{
}
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -188,10 +189,18 @@
// Allocate intermediate tensors
_input_reshaped.allocator()->allocate();
_v2mm_output.allocator()->allocate();
+
+ // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
{
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
@@ -238,6 +247,12 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
}
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+
return Status{};
}
@@ -253,6 +268,10 @@
{
CLScheduler::get().enqueue(_output_stage_kernel);
}
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
void CLDepthwiseConvolutionLayer::prepare()
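configure() now takes an optional ActivationLayerInfo and appends a CLActivationLayer when it is enabled, so callers no longer chain a separate activation after the depthwise convolution. A hedged usage sketch of the extended signature (tensor initialisation and allocation elided):

    // Sketch only: tensors are assumed to be already initialised and allocated.
    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

    using namespace arm_compute;

    void depthwise_with_fused_relu(ICLTensor &src, const ICLTensor &weights, const ICLTensor &biases, ICLTensor &dst)
    {
        CLDepthwiseConvolutionLayer dwc;
        // Extended signature: depth_multiplier followed by an optional ActivationLayerInfo.
        dwc.configure(&src, &weights, &biases, &dst,
                      PadStrideInfo(1, 1, 1, 1),
                      1 /* depth_multiplier */,
                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
        dwc.run(); // runs the convolution and, when enabled, the fused activation
    }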
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
index 45f70d2..a0663b7 100644
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
@@ -68,7 +68,7 @@
}
else
{
- const float diff = image_size - 1;
+ const float diff = image_size - num_lowest_pixels;
for(size_t i = 0; i < 256; ++i)
{
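The one-line fix above brings the lookup-table scaling in line with the standard histogram-equalisation mapping: with cdf(k) the cumulative histogram, N the pixel count and cdf_min the population of the lowest occupied bin (num_lowest_pixels here), levels are remapped as lut(k) = round((cdf(k) - cdf_min) / (N - cdf_min) * 255), so the denominator must be N - cdf_min rather than N - 1. A compact reference implementation of that mapping (assumes a non-empty, non-uniform image so the denominator is non-zero):

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <numeric>

    // Build an equalisation LUT from a 256-bin histogram using
    // lut(k) = round((cdf(k) - cdf_min) / (N - cdf_min) * 255).
    static std::array<uint8_t, 256> equalize_lut(const std::array<uint32_t, 256> &hist)
    {
        std::array<uint32_t, 256> cdf{};
        std::partial_sum(hist.begin(), hist.end(), cdf.begin());

        const uint32_t total   = cdf.back(); // N
        const uint32_t cdf_min = *std::find_if(cdf.begin(), cdf.end(), [](uint32_t c) { return c != 0; });
        const float    diff    = static_cast<float>(total - cdf_min); // the corrected denominator

        std::array<uint8_t, 256> lut{};
        for(std::size_t k = 0; k < lut.size(); ++k)
        {
            const float v = (static_cast<float>(cdf[k]) - static_cast<float>(cdf_min)) / diff * 255.0f;
            lut[k] = static_cast<uint8_t>(std::lround(std::max(v, 0.0f)));
        }
        return lut;
    }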
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 364db34..4137071 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,11 +26,17 @@
#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
void CLFloor::configure(const ICLTensor *input, ICLTensor *output)
{
auto k = arm_compute::support::cpp14::make_unique<CLFloorKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
+
+Status CLFloor::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLFloorKernel::validate(input, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 010985d..6a2aac6 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -49,6 +49,7 @@
// Validate gemmlowp function
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
&weights.clone()->set_quantization_info(weights_quantization_info),
+ nullptr,
&output));
}
else
@@ -91,7 +92,7 @@
weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
// Configure gemmlowp function
- _mm_gemmlowp.configure(input, weights, output);
+ _mm_gemmlowp.configure(input, weights, nullptr, output);
// Revert back QuantizationInfo as input and weights could be used in other fully connected layers
input->info()->set_quantization_info(input_quantization_info);
@@ -100,7 +101,7 @@
else
{
// Configure matrix multiply kernel
- _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */, 1, false, retain_internal_weights));
+ _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */, 0, false, retain_internal_weights));
}
}
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
new file mode 100644
index 0000000..32e4678
--- /dev/null
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFuseBatchNormalization::CLFuseBatchNormalization()
+ : _fuse_bn_kernel()
+{
+}
+
+void CLFuseBatchNormalization::configure(const ICLTensor *conv_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
+ ICLTensor *fused_weights, ICLTensor *fused_bias,
+ const ICLTensor *conv_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
+ float epsilon)
+{
+ _fuse_bn_kernel.configure(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+Status CLFuseBatchNormalization::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+ const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+ float epsilon)
+{
+ return CLFuseBatchNormalizationKernel::validate(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+void CLFuseBatchNormalization::run()
+{
+ CLScheduler::get().enqueue(_fuse_bn_kernel, true);
+}
+} // namespace arm_compute
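CLFuseBatchNormalization folds a batch-normalisation layer into the preceding convolution ahead of time: under the usual fusion identities, each weight is scaled by gamma / sqrt(var + epsilon) and the bias becomes (conv_bias - mean) * gamma / sqrt(var + epsilon) + beta, leaving a single convolution with fused_weights / fused_bias at inference. A scalar, per-channel sketch of those identities:

    #include <cassert>
    #include <cmath>

    // Per-channel batch-norm folding:
    // y = gamma * (conv(x, w) + b - mean) / sqrt(var + eps) + beta
    //   = conv(x, w * s) + (b - mean) * s + beta, with s = gamma / sqrt(var + eps)
    struct FusedParams { float weight_scale; float bias; };

    static FusedParams fuse(float conv_bias, float mean, float var, float beta, float gamma, float eps)
    {
        const float s = gamma / std::sqrt(var + eps);
        return { s, (conv_bias - mean) * s + beta };
    }

    int main()
    {
        const FusedParams p = fuse(/*conv_bias=*/0.5f, /*mean=*/0.1f, /*var=*/4.0f,
                                   /*beta=*/0.2f, /*gamma=*/1.0f, /*eps=*/0.0f);
        // s = 1 / 2 = 0.5 ; fused bias = (0.5 - 0.1) * 0.5 + 0.2 = 0.4
        assert(std::fabs(p.weight_scale - 0.5f) < 1e-6f);
        assert(std::fabs(p.bias - 0.4f) < 1e-6f);
        return 0;
    }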
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index f16d1c0..baa0cf4 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -44,8 +44,9 @@
{
bool flag = true;
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
+ if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
{
+ // COMPMID-852
if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
{
constexpr float alpha = 3.2f;
@@ -71,8 +72,18 @@
} // namespace
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
- _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+ : _memory_group(std::move(memory_manager)),
+ _interleave_kernel(),
+ _transpose_kernel(),
+ _mm_kernel(),
+ _ma_kernel(),
+ _tmp_a(),
+ _tmp_b(),
+ _original_b(nullptr),
+ _is_interleaved_transposed(false),
+ _run_addition(false),
+ _reshape_b_only_on_first_run(false),
+ _is_prepared(false)
{
}
@@ -122,10 +133,7 @@
if(_is_interleaved_transposed)
{
reinterpret_input_as_3d = false;
- }
- if(_is_interleaved_transposed)
- {
matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
@@ -145,8 +153,10 @@
}
// Configure and tune matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d,
- reinterpret_input_as_3d));
+ _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
+ mult_transpose1xW_width, mult_interleave4x4_height,
+ depth_output_gemm3d, reinterpret_input_as_3d),
+ gemm_info.fp_mixed_precision());
CLScheduler::get().tune_kernel_static(_mm_kernel);
if(_is_interleaved_transposed)
@@ -227,7 +237,7 @@
}
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
if(beta != 0 && c != nullptr)
{
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 92d04d6..4694aa7 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,16 +91,21 @@
}
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
- _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
- _skip_im2col(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
+ : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
+ _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
+ _is_activationlayer_enabled(false), _is_prepared(false)
{
}
-void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, int gemm_3d_depth)
+void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ int gemm_3d_depth)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col));
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+ gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+ false, gemmlowp_output_stage);
if(_is_quantized)
{
@@ -112,7 +117,7 @@
input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
- _mm_gemmlowp.configure(input, weights, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+ _mm_gemmlowp.configure(input, weights, biases, output, gemm_info);
// Revert back QuantizationInfo as input and weights could be used in other convolution layers
input->info()->set_quantization_info(input_quantization_info);
@@ -121,16 +126,19 @@
else
{
// Configure matrix multiply function
- _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, gemm_3d_depth,
- _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */));
+ _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
}
}
-Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth, bool skip_im2col)
+Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col)
{
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
+ const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+ gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+ false, gemmlowp_output_stage);
+
if(is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -144,7 +152,7 @@
weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
// Perform validation step on GEMMLowp
- return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), output, gemm_info);
+ return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, gemm_info);
}
else
{
@@ -177,21 +185,21 @@
const unsigned int kernel_width = weights->info()->dimension(idx_width);
const unsigned int kernel_height = weights->info()->dimension(idx_height);
- _is_prepared = weights_info.retain_internal_weights();
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !_is_quantized;
- _append_bias = (biases != nullptr) && (!_is_quantized);
+ _is_prepared = weights_info.retain_internal_weights();
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _data_layout = data_layout;
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ _skip_col2im = data_layout == DataLayout::NHWC;
+ _append_bias = (biases != nullptr) && (!_is_quantized);
+ _is_activationlayer_enabled = act_info.enabled();
// Set the GPU target for im2col and col2im
_im2col_kernel.set_target(CLScheduler::get().target());
_col2im_kernel.set_target(CLScheduler::get().target());
- bool is_nhwc = _data_layout == DataLayout::NHWC;
- const ICLTensor *gemm_input_to_use = input;
- ICLTensor *gemm_output_to_use = output;
- ICLTensor *gemm_output_staged_to_use = output;
+ const ICLTensor *gemm_input_to_use = input;
+ ICLTensor *gemm_output_to_use = output;
const ICLTensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
@@ -238,17 +246,18 @@
}
// Create GEMM output tensor
- if(!is_nhwc || _is_quantized)
+ if(!_skip_col2im)
{
- // Calculate GEMM output shape
- TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
+ TensorShape shape_gemm;
+
+ // If we cannot skip col2im it means we run im2col as well
+ shape_gemm = _im2col_output.info()->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, conv_w * conv_h);
- // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
- const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
- TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
- info_gemm.set_quantization_info(output->info()->quantization_info());
+ // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
+ TensorInfo info_gemm(shape_gemm, 1, data_type);
+ info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
_gemm_output.allocator()->init(info_gemm);
_memory_group.manage(&_gemm_output);
@@ -256,56 +265,76 @@
gemm_output_to_use = &_gemm_output;
}
- // Configure and tune GEMM
- configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, (data_layout == DataLayout::NHWC) ? conv_h : 1);
-
- if(!_skip_im2col)
- {
- _im2col_output.allocator()->allocate();
- }
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+ gemmlowp_output_stage.gemmlowp_multiplier = 0;
+ gemmlowp_output_stage.gemmlowp_shift = 0;
// Configure output stage for quantized case
if(_is_quantized)
{
const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
+ const float multiplier = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _memory_group.manage(&_tmp_output);
- gemm_output_staged_to_use = &_tmp_output;
+ int min_activation = 0;
+ int max_activation = 0;
- _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset);
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+
+ if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+ max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+ // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation
+ _is_activationlayer_enabled = false;
+ }
+
+ // Set the GEMMLowp output stage info
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+ gemmlowp_output_stage.gemmlowp_shift = output_shift;
+ gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+ gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
}
- if(!is_nhwc || _is_quantized)
+ // Configure and tune GEMM
+ // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+ const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+ configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth);
+
+ if(!_skip_im2col)
{
- if(input->info()->data_layout() == DataLayout::NCHW)
- {
- // Configure and tune Col2Im
- _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, std::make_pair(conv_w, conv_h), num_groups);
- CLScheduler::get().tune_kernel_static(_col2im_kernel);
- }
- else
- {
- // Configure reshape layer
- _reshape_layer.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output);
- }
+ _im2col_output.allocator()->allocate();
}
- if(!is_nhwc || _is_quantized)
+ if(!_skip_col2im)
{
- _tmp_output.allocator()->allocate();
+ // Configure and tune Col2Im
+ _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
+ CLScheduler::get().tune_kernel_static(_col2im_kernel);
+ }
+
+ if(!_skip_col2im)
+ {
_gemm_output.allocator()->allocate();
}
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
"Output shape does not match the expected one");
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
-
if(_is_activationlayer_enabled)
{
_activationlayer_function.configure(output, nullptr, act_info);
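Note: the quantized path in this hunk folds the RELU-family activation into the GEMMLowp output stage by expressing its bounds in the quantized domain. A minimal standalone sketch of that mapping (illustrative only, not the library code; rounding is approximated with round-to-nearest):

#include <algorithm>
#include <cmath>
#include <cstdint>

enum class ReluKind { RELU, BOUNDED_RELU, LU_BOUNDED_RELU };

// Hypothetical stand-in for QuantizationInfo::quantize() on QASYMM8 (round to nearest).
static int32_t quantize_u8(float x, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
    return std::min<int32_t>(255, std::max<int32_t>(0, q));
}

// Clamping bounds handed to the GEMMLowp output stage, expressed in the quantized domain:
//   RELU            -> [offset, 255]
//   BOUNDED_RELU    -> [offset, Q(a)]
//   LU_BOUNDED_RELU -> [Q(b),   Q(a)]
static void relu_bounds(ReluKind kind, float a, float b, float scale, int32_t offset,
                        int32_t &min_bound, int32_t &max_bound)
{
    const int32_t a_q = quantize_u8(a, scale, offset);
    const int32_t b_q = quantize_u8(b, scale, offset);
    min_bound = (kind == ReluKind::LU_BOUNDED_RELU) ? b_q : offset;
    max_bound = (kind == ReluKind::RELU) ? 255 : a_q;
}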
@@ -336,16 +365,16 @@
const unsigned int kernel_width = weights->dimension(idx_width);
const unsigned int kernel_height = weights->dimension(idx_height);
- TensorInfo im2col_reshaped_info, info_gemm, tmp_info, weights_reshaped_info;
- const ITensorInfo *gemm_input_to_use = input;
- const ITensorInfo *gemm_output_to_use = output;
- const ITensorInfo *gemm_output_staged_to_use = output;
- const ITensorInfo *weights_to_use = weights;
+ TensorInfo im2col_reshaped_info, info_gemm, weights_reshaped_info;
+ const ITensorInfo *gemm_input_to_use = input;
+ const ITensorInfo *gemm_output_to_use = output;
+ const ITensorInfo *weights_to_use = weights;
- const bool is_nhwc = data_layout == DataLayout::NHWC;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !is_quantized;
- const bool append_bias = (biases != nullptr) && (!is_quantized);
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool append_bias = (biases != nullptr) && (!is_quantized);
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ const bool skip_col2im = data_layout == DataLayout::NHWC;
+ bool is_activationlayer_enabled = act_info.enabled();
ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -407,47 +436,76 @@
}
// Create GEMM output tensor
- if(!is_nhwc || is_quantized)
+ if(!skip_col2im)
{
- TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
+ TensorShape shape_gemm;
+
+ shape_gemm = gemm_input_to_use->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, conv_w * conv_h);
- const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
- // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
- info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
- info_gemm.set_quantization_info(output->quantization_info());
+
+ info_gemm = TensorInfo(shape_gemm, 1, data_type);
+ info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
gemm_output_to_use = &info_gemm;
}
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, (data_layout == DataLayout::NHWC) ? conv_h : 1, skip_im2col));
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+ gemmlowp_output_stage.gemmlowp_multiplier = 0;
+ gemmlowp_output_stage.gemmlowp_shift = 0;
if(is_quantized)
{
- float multiplier = input->quantization_info().scale * weights_to_use->quantization_info().scale / output->quantization_info().scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
- tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
- tmp_info.set_quantization_info(output->quantization_info());
- gemm_output_staged_to_use = &tmp_info;
+ const float multiplier = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
- // Validate output stage for quantized case
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, output->quantization_info().offset);
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+
+ int min_activation = 0;
+ int max_activation = 0;
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+
+ if(is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+ max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+ // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation
+ is_activationlayer_enabled = false;
+ }
+
+ // Set the GEMMLowp output stage info
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+ gemmlowp_output_stage.gemmlowp_shift = output_shift;
+ gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+ gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
}
+ // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+ const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col));
+
// Validate Col2Im
- if(!is_nhwc || is_quantized)
+ if(!skip_col2im)
{
- if(input->data_layout() == DataLayout::NCHW)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use,
- output,
- std::make_pair(conv_w, conv_h), num_groups));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups));
}
//Validate Activation Layer
- if(act_info.enabled())
+ if(is_activationlayer_enabled)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
}
@@ -472,9 +530,6 @@
{
// Run gemmlowp
_mm_gemmlowp.run();
-
- // Run output stage
- _gemmlowp_output_stage.run();
}
else
{
@@ -488,16 +543,9 @@
}
// Reshape output matrix
- if(_data_layout == DataLayout::NCHW || _is_quantized)
+ if(!_skip_col2im)
{
- if(_data_layout == DataLayout::NCHW)
- {
- CLScheduler::get().enqueue(_col2im_kernel, false);
- }
- else
- {
- _reshape_layer.run();
- }
+ CLScheduler::get().enqueue(_col2im_kernel, false);
}
//Run Activation Layer if enabled
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 0ce07c3..2d4d231 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -41,8 +41,11 @@
{
bool flag = true;
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+ if(gpu_target_is_in(gpu_target,
+ GPUTarget::G71, GPUTarget::G72,
+ GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT))
{
+ // COMPMID-852
if(k > 256 && m > 4 && reshape_b_only_on_first_run)
{
flag = ((0.72f + n * 0.10766f) < (n * 0.1284f));
@@ -52,6 +55,10 @@
flag = false;
}
}
+ else
+ {
+ flag = m > 1;
+ }
return flag;
}
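For readability, the reshape heuristic in this hunk can be restated as a plain function (illustrative sketch; 'is_g7x_family' stands for the G71/G72/G51/G51BIG/G51LIT target check, all other targets fall into the final branch):

// Decides whether matrix A and matrix B are interleaved/transposed before the lowp matrix multiply.
static bool use_interleaved_transposed(int m, int n, int k, bool reshape_b_only_on_first_run, bool is_g7x_family)
{
    if(is_g7x_family)
    {
        if(k > 256 && m > 4 && reshape_b_only_on_first_run)
        {
            // COMPMID-852: simple cost model comparing the reshaped and non-reshaped paths
            return (0.72f + n * 0.10766f) < (n * 0.1284f);
        }
        return false;
    }
    // Remaining targets: reshaping only pays off when matrix A has more than one row.
    return m > 1;
}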
@@ -65,24 +72,26 @@
_mtx_a_reduction_kernel(),
_mtx_b_reduction_kernel(),
_offset_contribution_kernel(),
+ _offset_contribution_output_stage_kernel(),
_vector_sum_col(),
_vector_sum_row(),
_tmp_a(),
_tmp_b(),
+ _mm_result_s32(),
_original_b(nullptr),
_a_offset(0),
_b_offset(0),
_is_interleaved_transposed(true),
_reshape_b_only_on_first_run(false),
- _is_prepared(false)
+ _is_prepared(false),
+ _fuse_output_stage(false)
{
}
-void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_UNUSED(gemm_info);
- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
_is_prepared = false;
_original_b = b;
@@ -103,9 +112,12 @@
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
// in order to know how the matrices have been reshaped
- const int m = a->info()->dimension(1);
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const bool unroll_block = dot8_supported(CLKernelLibrary::get().get_device());
+ const int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
const int n = b->info()->dimension(0);
const int k = a->info()->dimension(0);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
constexpr int mult_transpose1xW_width = 1;
constexpr int mult_interleave4x4_height = 1;
@@ -114,6 +126,9 @@
if(_is_interleaved_transposed)
{
+ // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
+ reinterpret_input_as_3d = false;
+
matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
@@ -124,15 +139,12 @@
}
// Configure interleave kernel
- _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
+ _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d(), unroll_block);
// Configure transpose kernel
_mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
}
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
-
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
if(_a_offset != 0)
{
@@ -158,8 +170,34 @@
_mtx_a_reduction_kernel.configure(a, &_vector_sum_row);
}
- // Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+ // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+ if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ _fuse_output_stage = true;
+
+ _memory_group.manage(&_mm_result_s32);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
+ mult_transpose1xW_width, mult_interleave4x4_height,
+ depth_output_gemm3d, reinterpret_input_as_3d));
+
+ // Configure offset contribution kernel
+ _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
+ _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+
+ _mm_result_s32.allocator()->allocate();
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
+ mult_transpose1xW_width, mult_interleave4x4_height,
+ depth_output_gemm3d, reinterpret_input_as_3d));
+
+ // Configure offset contribution kernel
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
+ }
// Allocate tensors
if(_is_interleaved_transposed)
@@ -182,45 +220,52 @@
}
}
-Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
- "The output matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
- "The output matrix must have the same number of columns as the matrix B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
int32_t a_offset = a->quantization_info().offset;
int32_t b_offset = b->quantization_info().offset;
- const int m = a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
- constexpr int mult_transpose1xW_width = 1;
- constexpr int mult_interleave4x4_height = 1;
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d);
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const int n = b->dimension(0);
+ const int k = a->dimension(0);
+ constexpr int mult_transpose1xW_width = 1;
+ constexpr int mult_interleave4x4_height = 1;
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
+ // if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
if(reshape_matrices)
{
- TensorInfo info_a(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()), 1, a->data_type());
- TensorInfo info_b(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width), 1, b->data_type());
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b, mult_transpose1xW_width));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output, reshape_matrices, reshape_info));
+ reinterpret_input_as_3d = false;
}
- else
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ if(reshape_matrices)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(a, b, output, reshape_matrices, reshape_info));
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
}
TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -243,11 +288,37 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row));
}
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- a_offset, b_offset));
+ if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ TensorInfo mm_result_s32_info{};
+
+ // Output tensor auto-initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_matrices, reshape_info)).set_data_type(DataType::S32));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_matrices, reshape_info));
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ output,
+ a_offset, b_offset,
+ gemm_info.gemmlowp_output_stage()));
+ }
+ else
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_matrices, reshape_info));
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ a_offset, b_offset));
+ }
return Status{};
}
@@ -285,8 +356,16 @@
CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
}
- // Run offset contribution kernel
- CLScheduler::get().enqueue(_offset_contribution_kernel, true);
+ if(_fuse_output_stage)
+ {
+ // Run offset contribution/output stage kernel
+ CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true);
+ }
+ else
+ {
+ // Run offset contribution kernel
+ CLScheduler::get().enqueue(_offset_contribution_kernel, true);
+ }
_memory_group.release();
}
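The fused path above asks CLGEMMLowpOffsetContributionOutputStageKernel to turn the S32 accumulators directly into QASYMM8. A rough scalar model of the QUANTIZE_DOWN_FIXEDPOINT stage it applies (assumed semantics, with the gemmlowp rounding simplified):

#include <algorithm>
#include <cstdint>

// Approximate scalar model of the QUANTIZE_DOWN_FIXEDPOINT output stage. 'acc' is the S32
// accumulator, already including the bias and the a/b offset contributions.
static uint8_t requantize_fixedpoint(int32_t acc, int32_t multiplier, int32_t shift,
                                     int32_t result_offset, int32_t min_bound, int32_t max_bound)
{
    // Fixed-point multiply by a Q0.31 multiplier (rounding-doubling high mul, simplified:
    // saturation and the negative-nudge case are ignored here).
    const int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
    int32_t scaled     = static_cast<int32_t>((prod + (int64_t(1) << 30)) >> 31);
    scaled             = scaled >> shift;   // the kernel uses a rounding divide-by-power-of-two
    scaled += result_offset;                // gemmlowp_offset, i.e. the output zero-point
    return static_cast<uint8_t>(std::min(max_bound, std::max(min_bound, scaled)));
}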
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index 16d8678..f1282cb 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,11 +25,12 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
{
auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
@@ -42,15 +43,33 @@
return CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(input, bias, output, min, max);
}
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
+ int min, int max)
{
auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
_kernel = std::move(k);
}
-Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+ int min, int max)
{
return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
-}
\ No newline at end of file
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ float multiplier, int offset,
+ int min, int max)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel>();
+ k->configure(input, bias, output, multiplier, offset, min, max);
+ _kernel = std::move(k);
+}
+
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+ int min, int max)
+{
+ return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(input, bias, output, min, max);
+}
+} // namespace arm_compute
\ No newline at end of file
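A minimal usage sketch for the new float-scale output stage, based on the configure() signature added above. The tensor shapes and the multiplier/offset/min/max values are placeholders; in practice multiplier = (scale_a * scale_b) / scale_out and offset is the output zero-point:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // S32 accumulators from the lowp GEMM, optional S32 per-column bias, QASYMM8 destination.
    CLTensor acc, bias, dst;
    acc.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::S32));
    bias.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::S32));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8));

    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat output_stage;
    output_stage.configure(&acc, &bias, &dst, 0.0023f /*multiplier*/, 128 /*offset*/, 0 /*min*/, 255 /*max*/);

    acc.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    output_stage.run();
    CLScheduler::get().sync();
    return 0;
}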
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
new file mode 100644
index 0000000..5dd1202
--- /dev/null
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)),
+ _permute_deltas_kernel(),
+ _flatten_deltas_kernel(),
+ _permute_scores_kernel(),
+ _flatten_scores_kernel(),
+ _compute_anchors_kernel(),
+ _bounding_box_kernel(),
+ _memset_kernel(),
+ _padded_copy_kernel(),
+ _cpp_nms_kernel(),
+ _deltas_permuted(),
+ _deltas_flattened(),
+ _scores_permuted(),
+ _scores_flattened(),
+ _all_anchors(),
+ _all_proposals(),
+ _keeps_nms_unused(),
+ _classes_nms_unused(),
+ _proposals_4_roi_values(),
+ _num_valid_proposals(nullptr),
+ _scores_out(nullptr)
+{
+}
+
+void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
+ const GenerateProposalsInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
+ ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+
+ const DataType data_type = deltas->info()->data_type();
+ const int num_anchors = scores->info()->dimension(2);
+ const int feat_width = scores->info()->dimension(0);
+ const int feat_height = scores->info()->dimension(1);
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int pre_nms_topN = info.pre_nms_topN();
+ const int post_nms_topN = info.post_nms_topN();
+ const size_t values_per_roi = info.values_per_roi();
+
+ // Compute all the anchors
+ _memory_group.manage(&_all_anchors);
+ _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+
+ const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
+ _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, data_type));
+
+ // Permute and reshape deltas
+ _memory_group.manage(&_deltas_permuted);
+ _memory_group.manage(&_deltas_flattened);
+ _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+ _deltas_permuted.allocator()->allocate();
+
+ const TensorShape flatten_shape_scores(1, total_num_anchors);
+ _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, data_type));
+
+ // Permute and reshape scores
+ _memory_group.manage(&_scores_permuted);
+ _memory_group.manage(&_scores_flattened);
+ _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+ _scores_permuted.allocator()->allocate();
+
+ // Bounding box transform
+ _memory_group.manage(&_all_proposals);
+ BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f);
+ _bounding_box_kernel.configure(&_all_anchors, &_all_proposals, &_deltas_flattened, bbox_info);
+ _deltas_flattened.allocator()->allocate();
+ _all_anchors.allocator()->allocate();
+
+ // The original layer implementation first selects the best pre_nms_topN anchors (thus having a lightweight sort)
+ // that are then transformed by bbox_transform. The boxes generated are then fed into a non-sorting NMS operation.
+ // Since we reuse the NMS layer and do not implement any sorting on the CL side, we let NMS sort all of the input
+ // and do the filtering
+ const int scores_nms_size = std::min<int>(std::min<int>(post_nms_topN, pre_nms_topN), total_num_anchors);
+ const float min_size_scaled = info.min_size() * info.im_scale();
+ _memory_group.manage(&_classes_nms_unused);
+ _memory_group.manage(&_keeps_nms_unused);
+
+ // Note that NMS needs outputs preinitialized.
+ auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, data_type);
+ auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, data_type);
+ auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
+
+ // Initialize the temporary (unused) outputs
+ _classes_nms_unused.allocator()->init(TensorInfo(TensorShape(1, 1), 1, data_type));
+ _keeps_nms_unused.allocator()->init(*scores_out->info());
+
+ // Save the output (to map and unmap them at run)
+ _scores_out = scores_out;
+ _num_valid_proposals = num_valid_proposals;
+
+ _memory_group.manage(&_proposals_4_roi_values);
+ _cpp_nms_kernel.configure(&_scores_flattened, &_all_proposals, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
+ BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
+ _keeps_nms_unused.allocator()->allocate();
+ _classes_nms_unused.allocator()->allocate();
+ _all_proposals.allocator()->allocate();
+ _scores_flattened.allocator()->allocate();
+
+ // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
+ _padded_copy_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _proposals_4_roi_values.allocator()->allocate();
+
+ _memset_kernel.configure(proposals, PixelValue());
+}
+
+Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+ const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW);
+
+ const int num_anchors = scores->dimension(2);
+ const int feat_width = scores->dimension(0);
+ const int feat_height = scores->dimension(1);
+ const int num_images = scores->dimension(3);
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int values_per_roi = info.values_per_roi();
+
+ ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
+
+ TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+
+ TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+
+ TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
+
+ TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+
+ TensorInfo scores_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+ TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, BoundingBoxTransformInfo(info.im_width(), info.im_height(),
+ 1.f)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(&proposals_4_roi_values, proposals, PaddingList{ { 0, 1 } }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(proposals, PixelValue()));
+
+ if(num_valid_proposals->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
+ }
+
+ if(proposals->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(proposals, deltas);
+ }
+
+ if(scores_out->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores_out, scores);
+ }
+
+ return Status{};
+}
+
+void CLGenerateProposalsLayer::run_cpp_nms_kernel()
+{
+ // Map inputs
+ _scores_flattened.map(true);
+ _all_proposals.map(true);
+
+ // Map outputs
+ _scores_out->map(CLScheduler::get().queue(), true);
+ _proposals_4_roi_values.map(CLScheduler::get().queue(), true);
+ _num_valid_proposals->map(CLScheduler::get().queue(), true);
+ _keeps_nms_unused.map(true);
+ _classes_nms_unused.map(true);
+
+ // Run nms
+ CPPScheduler::get().schedule(&_cpp_nms_kernel, Window::DimX);
+
+ // Unmap outputs
+ _keeps_nms_unused.unmap();
+ _classes_nms_unused.unmap();
+ _scores_out->unmap(CLScheduler::get().queue());
+ _proposals_4_roi_values.unmap(CLScheduler::get().queue());
+ _num_valid_proposals->unmap(CLScheduler::get().queue());
+
+ // Unmap inputs
+ _scores_flattened.unmap();
+ _all_proposals.unmap();
+}
+
+void CLGenerateProposalsLayer::run()
+{
+ // Acquire all the temporaries
+ _memory_group.acquire();
+
+ // Compute all the anchors
+ CLScheduler::get().enqueue(_compute_anchors_kernel, false);
+
+ // Transpose and reshape the inputs
+ CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+ CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
+ CLScheduler::get().enqueue(_permute_scores_kernel, false);
+ CLScheduler::get().enqueue(_flatten_scores_kernel, false);
+
+ // Build the boxes
+ CLScheduler::get().enqueue(_bounding_box_kernel, false);
+ // Non maxima suppression
+ run_cpp_nms_kernel();
+ // Add dummy batch indexes
+ CLScheduler::get().enqueue(_memset_kernel, true);
+ CLScheduler::get().enqueue(_padded_copy_kernel, true);
+
+ // Release all the temporaries
+ _memory_group.release();
+}
+} // namespace arm_compute
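As a sizing aid, a standalone sketch of the shape bookkeeping enforced by the configure()/validate() above (illustrative helper, not part of the library):

#include <algorithm>

struct ProposalsShapes
{
    int proposals_cols;   // values_per_roi + 1 (leading batch-index column)
    int proposals_rows;   // total number of anchors, as checked by validate()
    int scores_out_len;   // flattened scores length, as checked by validate()
    int scores_nms_size;  // number of boxes configure() asks NMS to keep
};

static ProposalsShapes expected_shapes(int feat_width, int feat_height, int num_anchors,
                                       int values_per_roi, int pre_nms_topN, int post_nms_topN)
{
    const int total_num_anchors = num_anchors * feat_width * feat_height;
    ProposalsShapes s{};
    s.proposals_cols  = values_per_roi + 1;
    s.proposals_rows  = total_num_anchors;
    s.scores_out_len  = total_num_anchors;
    s.scores_nms_size = std::min(std::min(post_nms_topN, pre_nms_topN), total_num_anchors);
    return s;
}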
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 423faea..65ce7de 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -65,7 +65,7 @@
float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
{
- ARM_COMPUTE_UNUSED(use_fp16);
+ ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-772): Add half float support
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index a3010a7..4f709d5 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -63,8 +63,8 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
- // Reduce shape on axis (supported axis is 0)
- shape.set(0, 1);
+ // Reduce shape on axis
+ shape.set(axis, 1);
sum_sq.set_tensor_shape(shape);
ARM_COMPUTE_RETURN_ON_ERROR(CLL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 3458135..a89c4e3 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -130,7 +130,6 @@
_forget_gate_out3.allocator()->allocate();
}
_activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
- forget_gate_out->allocator()->allocate();
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -195,7 +194,6 @@
_activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
_pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _input_gate_out1.allocator()->allocate();
_cell_state_out4.allocator()->allocate();
_pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_forget_gate_out1.allocator()->allocate();
@@ -246,7 +244,6 @@
_output1.allocator()->allocate();
}
_activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
- output_gate_out->allocator()->allocate();
// Configure block that calculates the output state
/** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -281,12 +278,11 @@
// Copy cell state and output
_copy_cell_state.configure(&_cell_state_out1, cell_state_out);
- _cell_state_out1.allocator()->allocate();
_copy_output.configure(output_state_out, output);
// Vector for holding the tensors to store in scratch buffer
std::vector<ICLTensor *> scratch_inputs;
- if(lstm_params.has_cifg_opt())
+ if(!lstm_params.has_cifg_opt())
{
scratch_inputs.emplace_back(&_input_gate_out1);
}
@@ -294,6 +290,10 @@
scratch_inputs.emplace_back(forget_gate_out);
scratch_inputs.emplace_back(output_gate_out);
_concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+ _input_gate_out1.allocator()->allocate();
+ _cell_state_out1.allocator()->allocate();
+ forget_gate_out->allocator()->allocate();
+ output_gate_out->allocator()->allocate();
}
Status CLLSTMLayer::validate(const ITensorInfo *input,
@@ -444,7 +444,7 @@
// Validate scratch concatenation
std::vector<ITensorInfo *> inputs_vector_info_raw;
- if(lstm_params.has_cifg_opt())
+ if(!lstm_params.has_cifg_opt())
{
inputs_vector_info_raw.push_back(&input_gate);
}
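The two sign flips above encode the same rule: an explicit input gate exists, and therefore belongs in the scratch buffer, only when the CIFG optimisation is disabled. A minimal sketch of that assembly, with placeholder types and the entries not shown in this hunk elided:

#include <vector>

struct Tensor; // opaque placeholder for ICLTensor

// Scratch-buffer concatenation order, as assembled in configure() above.
static std::vector<Tensor *> scratch_inputs(bool has_cifg_opt, Tensor *input_gate,
                                            Tensor *forget_gate, Tensor *output_gate)
{
    std::vector<Tensor *> inputs;
    if(!has_cifg_opt)
    {
        // CIFG couples the input gate to the forget gate, so there is no separate tensor to store.
        inputs.push_back(input_gate);
    }
    // ... other scratch tensors, as in configure() ...
    inputs.push_back(forget_gate);
    inputs.push_back(output_gate);
    return inputs;
}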
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 40bf032..5c6bef9 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -122,7 +122,7 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
ARM_COMPUTE_RETURN_ON_ERROR(CLWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLLocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, std::make_pair(conv_w, conv_h)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
return Status{};
}
@@ -163,7 +163,7 @@
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
- _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
// Allocate intermediate tensors
_input_im2col_reshaped.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
new file mode 100644
index 0000000..11d70e3
--- /dev/null
+++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLNormalizePlanarYUVLayer::CLNormalizePlanarYUVLayer()
+ : _norm_kernel()
+{
+}
+
+void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+{
+ _norm_kernel.configure(input, output, mean, std);
+}
+
+Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *mean, const ITensorInfo *std)
+{
+ return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
+}
+
+void CLNormalizePlanarYUVLayer::run()
+{
+ CLScheduler::get().enqueue(_norm_kernel, true);
+}
+} // namespace arm_compute
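For reference, a scalar sketch of the per-channel computation this function dispatches to the kernel (assumed semantics, float case only):

#include <cstddef>
#include <vector>

// out[c][i] = (in[c][i] - mean[c]) / std[c] for every spatial element i of channel c,
// with the input laid out as 'channels' contiguous planes.
static std::vector<float> normalize_planar_yuv(const std::vector<float> &in, std::size_t channels,
                                               const std::vector<float> &mean, const std::vector<float> &stddev)
{
    const std::size_t per_channel = in.size() / channels;
    std::vector<float> out(in.size());
    for(std::size_t c = 0; c < channels; ++c)
    {
        for(std::size_t i = 0; i < per_channel; ++i)
        {
            out[c * per_channel + i] = (in[c * per_channel + i] - mean[c]) / stddev[c];
        }
    }
    return out;
}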
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
new file mode 100644
index 0000000..de43c7d
--- /dev/null
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+CLPadLayer::CLPadLayer()
+ : _copy_kernel(), _fillborder_kernel(), _memset_kernel()
+{
+}
+
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding)
+{
+ // Copy the input to the output
+ _copy_kernel.configure(input, output, padding);
+
+ // Zero out the whole output so that the padded regions keep this value
+ _memset_kernel.configure(output, PixelValue());
+
+ // Fill padding on the first two dimensions with zeros
+ _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT);
+}
+
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
+
+ return Status{};
+}
+
+void CLPadLayer::run()
+{
+ CLScheduler::get().enqueue(_memset_kernel, false);
+ CLScheduler::get().enqueue(_fillborder_kernel, false);
+ CLScheduler::get().enqueue(_copy_kernel, true);
+}
+} // namespace arm_compute
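A minimal usage sketch of the new function, based on the configure() signature above (tensor shapes and padding amounts are placeholders):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Pad a 32x32 F32 tensor with one zero element on each side of the first two dimensions.
    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(34U, 34U), 1, DataType::F32));

    CLPadLayer pad;
    pad.configure(&src, &dst, PaddingList{ { 1, 1 }, { 1, 1 } });

    src.allocator()->allocate();
    dst.allocator()->allocate();

    pad.run();
    CLScheduler::get().sync();
    return 0;
}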
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
new file mode 100644
index 0000000..4f6c969
--- /dev/null
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLPriorBoxLayer::CLPriorBoxLayer()
+ : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
+{
+}
+
+void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+{
+ _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float));
+ _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float));
+ if(!info.max_sizes().empty())
+ {
+ _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float));
+ }
+
+ auto k = arm_compute::support::cpp14::make_unique<CLPriorBoxLayerKernel>();
+ k->configure(input1, input2, output, info, &_min, &_max, &_aspect_ratios);
+ _kernel = std::move(k);
+}
+
+Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+ return CLPriorBoxLayerKernel::validate(input1, input2, output, info);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
new file mode 100644
index 0000000..5bfd594
--- /dev/null
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info));
+
+ return Status{};
+}
+
+void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ // Configure ROI pooling kernel
+ auto k = arm_compute::support::cpp14::make_unique<CLROIAlignLayerKernel>();
+ k->configure(input, rois, output, pool_info);
+ _kernel = std::move(k);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
new file mode 100644
index 0000000..1016ff7
--- /dev/null
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceMean.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
+{
+}
+void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels = arm_compute::support::cpp14::make_unique<CLReductionOperation[]>(_reduction_ops);
+ _reduced_outs = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
+
+ // Perform reduction for every axis
+ for(unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+ out_shape.set(reduction_axis[i], 1);
+ auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+
+ if(i == _reduction_ops - 1 && keep_dims)
+ {
+ _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ }
+ else
+ {
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
+ _memory_group.manage(_reduced_outs.get() + i);
+ _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ }
+ }
+
+ // Allocate intermediate tensors
+ for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ {
+ _reduced_outs[i].allocator()->allocate();
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if(!keep_dims)
+ {
+ TensorShape out_shape = input->info()->tensor_shape();
+
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ Coordinates axis_copy = reduction_axis;
+ std::sort(axis_copy.begin(), axis_copy.begin() + _reduction_ops);
+ for(unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_copy[i] - i);
+ }
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+ }
+}
+
+Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
+ if(output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+ }
+
+ return Status{};
+}
+
+void CLReduceMean::run()
+{
+ _memory_group.acquire();
+
+ for(unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ _reduction_kernels[i].run();
+ }
+
+ if(!_keep_dims)
+ {
+ _reshape.run();
+ }
+ _memory_group.release();
+}
+} // namespace arm_compute
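The subtle part of the keep_dims == false path is the axis bookkeeping: the reduction axes are sorted so that each remove_dimension() call accounts for the dimensions already dropped. A standalone sketch of just that shape computation:

#include <algorithm>
#include <cstddef>
#include <vector>

// Output shape when the reduced dimensions are dropped (keep_dims == false).
static std::vector<std::size_t> reduced_shape(std::vector<std::size_t> shape, std::vector<unsigned int> axes)
{
    std::sort(axes.begin(), axes.end());
    for(std::size_t i = 0; i < axes.size(); ++i)
    {
        // Each removal shifts the remaining axes left by one, hence the '- i'.
        shape.erase(shape.begin() + static_cast<std::ptrdiff_t>(axes[i] - i));
    }
    return shape;
}
// Example: reduced_shape({8, 4, 16, 2}, {2, 0}) returns {4, 2}.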
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 2a171c3..c5447ff 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -37,8 +37,13 @@
namespace
{
-unsigned int calculate_number_of_stages(const ITensorInfo *input)
+unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
{
+ // We need only 1 stage for every axis except the x-axis, and also for the x-axis when the data type is QASYMM8.
+ if(axis != 0 || (axis == 0 && is_data_type_quantized(input->data_type())))
+ {
+ return 1;
+ }
// Calculate number of WGs. 16 elements per thread, 8 threads per WG
const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
@@ -51,91 +56,149 @@
} // namespace
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+ : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
{
}
Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
- const unsigned int num_of_stages = calculate_number_of_stages(input);
+ const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
- // Create temporary tensor infos
- auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
-
- // Create intermediate tensor info
- TensorShape shape{ input->tensor_shape() };
-
- for(unsigned int i = 0; i < num_of_stages - 1; i++)
+ if(axis == 0 && !is_data_type_quantized(input->data_type()))
{
- shape.set(0, ceil(shape.x() / 128.f));
- sums_vector[i].set_data_type(input->data_type());
- sums_vector[i].set_tensor_shape(shape);
- sums_vector[i].set_num_channels(input->num_channels());
+ // Create temporary tensor infos
+ auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+
+ // Create intermediate tensor info
+ TensorShape shape{ input->tensor_shape() };
+
+ for(unsigned int i = 0; i < num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ sums_vector[i].set_data_type(input->data_type());
+ sums_vector[i].set_tensor_shape(shape);
+ sums_vector[i].set_num_channels(input->num_channels());
+ }
+
+ ReductionOperation first_kernel_op;
+ ReductionOperation last_kernel_op;
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ first_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ last_kernel_op = ReductionOperation::SUM;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ // Validate ReductionOperation only on first kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, first_kernel_op));
+
+ // Validate ReductionOperation on intermediate stages
+ for(unsigned int i = 1; i < num_of_stages - 1; ++i)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
+ }
+
+ // Validate ReductionOperation on the last stage
+ const unsigned int last_stage = num_of_stages - 1;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
}
-
- // Validate ReductionOperation only on first kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, op));
-
- // Validate ReductionOperation on intermediate stages
- for(unsigned int i = 1; i < num_of_stages - 1; ++i)
+ else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
}
- // Validate ReductionOperation on the last stage
- const unsigned int last_stage = num_of_stages - 1;
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, op));
-
return Status{};
}
void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
{
- _num_of_stages = calculate_number_of_stages(input->info());
-
- // Create temporary tensors
- _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ _num_of_stages = calculate_number_of_stages(input->info(), axis);
+ _reduction_axis = axis;
+ _is_quantized = is_data_type_quantized(input->info()->data_type());
// Configure reduction operation kernels
_reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
- _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
- TensorShape shape{ input->info()->tensor_shape() };
- for(unsigned int i = 0; i < _num_of_stages - 1; i++)
+ // Create temporary tensors
+ if(axis == 0 && !_is_quantized)
{
- shape.set(0, ceil(shape.x() / 128.f));
- _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
+ _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ TensorShape shape{ input->info()->tensor_shape() };
+ for(unsigned int i = 0; i < _num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+ }
+
+ // Apply ReductionOperation only on first kernel
+ _memory_group.manage(_sums_vector.get());
+
+ ReductionOperation first_kernel_op;
+ ReductionOperation last_kernel_op;
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ first_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ last_kernel_op = ReductionOperation::SUM;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
+ _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ // Apply ReductionOperation on intermediate stages
+ for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
+ {
+ _memory_group.manage(_sums_vector.get() + i);
+ _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
+ _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _sums_vector[i - 1].allocator()->allocate();
+ }
+
+ // Apply ReductionOperation on the last stage
+ const unsigned int last_stage = _num_of_stages - 1;
+ const unsigned int input_width = input->info()->dimension(0);
+ _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+ _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _sums_vector[last_stage - 1].allocator()->allocate();
}
-
- // Apply ReductionOperation only on first kernel
- _memory_group.manage(_sums_vector.get());
- _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, op);
- _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
-
- // Apply ReductionOperation on intermediate stages
- for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
+ else
{
- _memory_group.manage(_sums_vector.get() + i);
- _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
- _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[i - 1].allocator()->allocate();
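+ // Single-stage path: one kernel performs the whole reduction (non x-axis, or quantized x-axis)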
+ _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
}
-
- // Apply ReductionOperation on the last stage
- const unsigned int last_stage = _num_of_stages - 1;
- _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, ReductionOperation::SUM);
- _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[last_stage - 1].allocator()->allocate();
}
void CLReductionOperation::run()
{
_memory_group.acquire();
- for(unsigned int i = 0; i < _num_of_stages; ++i)
+ if(_reduction_axis == 0 && !_is_quantized)
{
- CLScheduler::get().enqueue(_border_handlers_vector[i], false);
- CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+ for(unsigned int i = 0; i < _num_of_stages; ++i)
+ {
+ CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+ CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+ }
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
}
_memory_group.release();
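For reference, a short configuration sketch contrasting the two paths above: an x-axis reduction on a float tensor goes through the multi-stage pipeline with border handlers, while a reduction along any other axis (or on a quantized x-axis input) is served by a single kernel. The shapes are illustrative and the CL scheduler is assumed to be initialized already.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

using namespace arm_compute;

void configure_reductions(CLTensor &src, CLTensor &dst_x, CLTensor &dst_y)
{
    // Assumed shapes: src 4096x64 F32, dst_x 1x64, dst_y 4096x1 (reduced dimension kept as 1)
    CLReductionOperation reduce_x; // axis 0 on F32: multi-stage path
    CLReductionOperation reduce_y; // axis 1: single-stage path

    reduce_x.configure(&src, &dst_x, 0, ReductionOperation::MEAN_SUM);
    reduce_y.configure(&src, &dst_y, 1, ReductionOperation::MEAN_SUM);
}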
diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp
new file mode 100644
index 0000000..8e04d16
--- /dev/null
+++ b/src/runtime/CL/functions/CLReorgLayer.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReorgLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLReorgLayerKernel>();
+ k->configure(input, output, stride);
+ _kernel = std::move(k);
+}
+
+Status CLReorgLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+ return CLReorgLayerKernel::validate(input, output, stride);
+}
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
index 2ce83dc..b98a99d 100644
--- a/src/runtime/CL/functions/CLReshapeLayer.cpp
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
#include "support/ToolchainSupport.h"
+/** [CLReshapeLayer snippet] **/
using namespace arm_compute;
void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output)
@@ -35,3 +36,9 @@
k->configure(input, output);
_kernel = std::move(k);
}
+
+Status CLReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLReshapeLayerKernel::validate(input, output);
+}
+/** [CLReshapeLayer snippet] **/
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 4ff9763..f204e64 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -50,3 +50,9 @@
}
_border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
}
+
+Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+{
+ ARM_COMPUTE_UNUSED(constant_border_value);
+ return CLScaleKernel::validate(input, output, policy, border_mode, sampling_policy);
+}
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
new file mode 100644
index 0000000..bef7eca
--- /dev/null
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSlice.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ // Get absolute end coordinates
+ const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->info()->tensor_shape(), ends);
+
+ auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
+ k->configure(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+ _kernel = std::move(k);
+}
+
+Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+
+ // Check start dimensions for being non-negative
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
+ {
+ return i < 0;
+ }));
+
+ // Get absolute end coordinates
+ const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->tensor_shape(), ends);
+
+ return CLStridedSliceKernel::validate(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+}
+} // namespace arm_compute
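A small usage sketch of CLSlice as wrapped above, assuming TensorFlow-style semantics where starts are inclusive and ends are exclusive; the tensor shape and coordinates are illustrative.

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"

using namespace arm_compute;

void configure_slice(CLTensor &src, CLTensor &dst)
{
    // src is assumed to be an 8x4 F32 tensor; taking elements [1,5) along x and [0,2) along y
    // yields a 4x2 region, which dst is assumed (or auto-initialized) to match
    CLSlice slice;
    slice.configure(&src, &dst, Coordinates(1, 0), Coordinates(5, 2));
}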
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7a20d9f..d671846 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -29,29 +29,80 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+ : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel_ptr(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(),
+ _needs_flattening(false)
{
}
-void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta)
+void CLSoftmaxLayer::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
+{
+ // Flatten the input
+ const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
+
+ // Initialize the flat input
+ _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
+
+ // To flatten the input we can use either CLFlattenLayerKernel or CLReshapeLayerKernel.
+ // When axis is 3 (i.e. the first three dimensions are squashed together) we use CLFlattenLayerKernel;
+ // in all other cases we have to use CLReshapeLayerKernel.
+ if(axis != 3)
+ {
+ auto reshape_kernel_ptr = support::cpp14::make_unique<CLReshapeLayerKernel>();
+ reshape_kernel_ptr->configure(input, &_input_flattened);
+ _flatten_kernel_ptr = std::move(reshape_kernel_ptr);
+ }
+ else
+ {
+ auto flatten_kernel_ptr = support::cpp14::make_unique<CLFlattenLayerKernel>();
+ flatten_kernel_ptr->configure(input, &_input_flattened);
+ _flatten_kernel_ptr = std::move(flatten_kernel_ptr);
+ }
+
+ // We need to init the output tensor here. Indeed, the reshape kernel expects
+ // both tensors to be already initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+}
+
+void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info(), beta, axis));
+
+ // Flattening is only skipped when axis is 1, i.e. the softmax is computed along the x dimension
+ _needs_flattening = axis != 1;
+
+ // If we are dealing with a 4D tensor, we will:
+ // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
+ // - Execute all the pipeline (reduction + normalization) on the flattened tensor
+ // - Reshape the flattened output into the real output
+ if(_needs_flattening)
+ {
+ // Add _input_flattened to the memory group
+ _memory_group.manage(&_input_flattened);
+
+ // Configure _flatten_kernel and _input_flattened
+ configure_reshape_input_kernel(input, output, axis);
+ }
+
+ // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
+ // or it is the original input itself (2D case)
+ const ICLTensor *input_2D = (_needs_flattening ? &_input_flattened : input);
// Create intermediate tensors shapes
- const TensorInfo input_info = input->info()->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type = is_data_type_quantized_asymmetric(input->info()->data_type()) ? DataType::S32 : input->info()->data_type();
- TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+ TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
+ DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::S32 : input_2D->info()->data_type();
+ TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
_tmp.allocator()->init(tensor_info_tmp);
- TensorShape max_sum_shape = input->info()->tensor_shape();
+ TensorShape max_sum_shape = input_2D->info()->tensor_shape();
max_sum_shape.set(0, 1);
_max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
_sum.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type));
@@ -65,8 +116,28 @@
_memory_group.manage(&_sum);
// Configure kernels
- _max_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
- _norm_kernel.configure(&_tmp, &_sum, output, beta);
+ _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, beta);
+
+ if(_needs_flattening)
+ {
+ // Add _output_flattened to the memory group
+ _memory_group.manage(&_output_flattened);
+
+ // The normalization kernel stores the result in a flat output tensor
+ _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, beta);
+
+ // Reshape the flat output into the requested (4D) output
+ _reshape_kernel.configure(&_output_flattened, output);
+
+ // Allocate the intermediate flat tensors
+ _input_flattened.allocator()->allocate();
+ _output_flattened.allocator()->allocate();
+ }
+ else
+ {
+ // Softmax 2D case
+ _norm_kernel.configure(&_tmp, &_sum, output, beta);
+ }
// Allocate intermediate buffers
_tmp.allocator()->allocate();
@@ -74,10 +145,11 @@
_sum.allocator()->allocate();
}
-Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+ ARM_COMPUTE_UNUSED(beta);
// Create intermediate tensor info
DataType tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
@@ -88,9 +160,32 @@
TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
+ const bool needs_flattening = (axis != 1);
+
+ if(needs_flattening)
+ {
+ const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis);
+ TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+
+ if(axis != 3)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(input, &tensor_info_flat));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayerKernel::validate(input, &tensor_info_flat));
+ }
+ }
+
ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output));
+ if(needs_flattening)
+ {
+ const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input);
+ TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+ }
+
return Status{};
}
@@ -98,8 +193,21 @@
{
_memory_group.acquire();
- CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
- CLScheduler::get().enqueue(_norm_kernel);
+ if(_needs_flattening)
+ {
+ CLScheduler::get().enqueue(*_flatten_kernel_ptr, false);
+ }
+ CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
+ CLScheduler::get().enqueue(_norm_kernel, !_needs_flattening);
+
+ if(_needs_flattening)
+ {
+ CLScheduler::get().enqueue(_reshape_kernel, true);
+ }
+
+ // Release intermediate buffers
_memory_group.release();
}
+
+} // namespace arm_compute
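A configuration sketch of the new axis handling: axis squashes that many leading dimensions into the softmax dimension, so with the default axis of 1 the kernels run on the input directly, while any other axis triggers the flatten/reshape pipeline set up above. Shapes and beta are illustrative.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

using namespace arm_compute;

void configure_softmax(CLTensor &logits_2d, CLTensor &probs_2d, CLTensor &feat_4d, CLTensor &probs_4d)
{
    CLSoftmaxLayer softmax_2d;
    CLSoftmaxLayer softmax_4d;

    // axis = 1 (default): no flattening, softmax along the x dimension
    softmax_2d.configure(&logits_2d, &probs_2d, 1.0f /* beta */);

    // axis = 2 on a 4D tensor: the first two dimensions are flattened together first,
    // the reduction/normalization pipeline runs on the flat tensor and the result is reshaped back
    softmax_4d.configure(&feat_4d, &probs_4d, 1.0f /* beta */, 2);
}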
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
new file mode 100644
index 0000000..76c1e18
--- /dev/null
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLSpaceToBatchLayer::CLSpaceToBatchLayer()
+ : _space_to_batch_kernel(), _output(nullptr), _has_padding(false)
+{
+}
+
+void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ {
+ _has_padding = true;
+ }
+
+ _output = output;
+ _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+}
+
+void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ {
+ _has_padding = true;
+ }
+
+ _output = output;
+ _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+}
+
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+ return CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output);
+}
+
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+ const ITensorInfo *output)
+{
+ return CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+}
+
+void CLSpaceToBatchLayer::run()
+{
+ // Zero out the output only if we have padding
+ // TODO(micspy01): replace with memset once ready
+ if(_has_padding)
+ {
+ _output->map(CLScheduler::get().queue(), true);
+ if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
+ {
+ const uint8_t quantized_zero = _output->info()->quantization_info().offset;
+ std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
+ }
+ else
+ {
+ memset(_output->buffer(), 0, _output->info()->total_size());
+ }
+ _output->unmap(CLScheduler::get().queue());
+ }
+
+ CLScheduler::get().enqueue(_space_to_batch_kernel, true);
+}
+} // namespace arm_compute
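A usage sketch of the static-shape overload above, assuming a 2x2 block and no padding: the spatial dimensions shrink by the block factors while the batch dimension grows by block_x * block_y, and dst is assumed to be initialized to that output shape.

#include "arm_compute/core/Size2D.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h"

using namespace arm_compute;

void configure_space_to_batch(CLTensor &src, CLTensor &dst)
{
    // Block shape 2x2, no padding: e.g. an 8x8xCxN input becomes a 4x4xCx(4N) output
    CLSpaceToBatchLayer space_to_batch;
    space_to_batch.configure(&src, 2, 2, Size2D(), Size2D(), &dst);
}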
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
new file mode 100644
index 0000000..f084351
--- /dev/null
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSplit.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+CLSplit::CLSplit()
+ : _outputs_vector(), _slice_functions(), _num_outputs(0)
+{
+}
+
+void CLSplit::configure(const ICLTensor *input, const std::vector<ICLTensor *> &outputs, unsigned int axis)
+{
+ // Create Slice functions
+ _num_outputs = outputs.size();
+ _slice_functions = arm_compute::support::cpp14::make_unique<CLSlice[]>(_num_outputs);
+
+ // Get output shape
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
+
+ // Extract output tensor info
+ std::vector<ITensorInfo *> outputs_info;
+ for(auto &output : outputs)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ outputs_info.emplace_back(output->info());
+ }
+
+ // Validate
+ ARM_COMPUTE_ERROR_THROW_ON(CLSplit::validate(input->info(), outputs_info, axis));
+
+ const size_t axis_split_step = output_shape[axis];
+ unsigned int axis_offset = 0;
+
+ // Start/End coordinates
+ Coordinates start_coords;
+ Coordinates end_coords;
+ for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ {
+ end_coords.set(d, -1);
+ }
+
+ for(unsigned int i = 0; i < _num_outputs; i++)
+ {
+ // Update coordinate on axis
+ start_coords.set(axis, axis_offset);
+ end_coords.set(axis, axis_offset + axis_split_step);
+
+ // Configure slice function
+ _slice_functions[i].configure(input, outputs[i], start_coords, end_coords);
+
+ // Set valid region from shape
+ outputs[i]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
+
+ // Update axis offset
+ axis_offset += axis_split_step;
+ }
+}
+
+Status CLSplit::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs, unsigned int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis >= input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2);
+
+ // Get output shape
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input, axis, outputs.size());
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
+
+ const size_t axis_split_step = output_shape[axis];
+ unsigned int axis_offset = 0;
+
+ // Start/End coordinates
+ Coordinates start_coords;
+ Coordinates end_coords;
+ for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ {
+ end_coords.set(d, -1);
+ }
+
+ // Validate output tensors
+ for(const auto &output : outputs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ // Output auto-initialization if not yet initialized
+ TensorInfo tmp_output_info = *output->clone();
+ auto_init_if_empty(tmp_output_info, input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+ // Update coordinate on axis
+ start_coords.set(axis, axis_offset);
+ end_coords.set(axis, axis_offset + axis_split_step);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(input, output, start_coords, end_coords));
+ axis_offset += axis_split_step;
+ }
+
+ return Status{};
+}
+
+void CLSplit::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ for(unsigned i = 0; i < _num_outputs; ++i)
+ {
+ _slice_functions[i].run();
+ }
+}
+} // namespace arm_compute
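A usage sketch of CLSplit as implemented above: a tensor that is 12 elements wide is split into three 4-wide outputs along axis 0. The shapes and number of outputs are illustrative.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSplit.h"

#include <vector>

using namespace arm_compute;

void configure_split(CLTensor &src, CLTensor &out0, CLTensor &out1, CLTensor &out2)
{
    // src is assumed to be 12x8 F32; each output becomes a 4x8 slice along axis 0
    std::vector<ICLTensor *> outputs = { &out0, &out1, &out2 };

    CLSplit split;
    split.configure(&src, outputs, 0);
}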
diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
new file mode 100644
index 0000000..e34f653
--- /dev/null
+++ b/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
+
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
+ k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ _kernel = std::move(k);
+}
+
+Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp
new file mode 100644
index 0000000..1dad325
--- /dev/null
+++ b/src/runtime/CL/functions/CLUpsampleLayer.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLUpsampleLayer.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLUpsampleLayer::CLUpsampleLayer() // NOLINT
+ : _upsample(),
+ _output(nullptr)
+{
+}
+
+Status CLUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const Size2D &info, const InterpolationPolicy upsampling_policy)
+{
+ return CLUpsampleLayerKernel::validate(input, output, info, upsampling_policy);
+}
+
+void CLUpsampleLayer::configure(ICLTensor *input, ICLTensor *output,
+ const Size2D &info, const InterpolationPolicy upsampling_policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ _output = output;
+ _upsample.configure(input, _output, info, upsampling_policy);
+}
+
+void CLUpsampleLayer::run()
+{
+ CLScheduler::get().enqueue(_upsample, false);
+}
+} // namespace arm_compute
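A usage sketch of the wrapper above, assuming the 2x nearest-neighbour upsampling used by YOLO-style models; dst is assumed to have (or be auto-initialized to) twice the spatial dimensions of src.

#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLUpsampleLayer.h"

using namespace arm_compute;

void configure_upsample(CLTensor &src, CLTensor &dst)
{
    // Doubles width and height by replicating each input element
    CLUpsampleLayer upsample;
    upsample.configure(&src, &dst, Size2D(2, 2), InterpolationPolicy::NEAREST_NEIGHBOR);
}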
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 5233ff4..46a2d80 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -36,26 +36,46 @@
CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
: _concat_kernels_vector(),
+ _concat_x2_kernel(),
+ _concat_x4_kernel(),
_num_inputs(0)
{
}
Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
{
+ const unsigned int num_inputs = inputs_vector.size();
+
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
// Output auto inizialitation if not yet initialized
TensorInfo tmp_output_info = *output->clone();
TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
- unsigned int width_offset = 0;
- for(const auto &input : inputs_vector)
+ switch(num_inputs)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
- width_offset += input->dimension(0);
+ case 2:
+ // Validate the WidthConcatenate2Tensors kernel if there are 2 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], &tmp_output_info));
+ break;
+ case 4:
+ // Validate the WidthConcatenate4Tensors kernel if there are 4 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], &tmp_output_info));
+ break;
+ default:
+ unsigned int width_offset = 0;
+ // Validate generic case of WidthConcatenate kernel
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+ width_offset += input->dimension(0);
+ }
+ break;
}
return Status{};
@@ -74,16 +94,30 @@
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+
ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
- unsigned int width_offset = 0;
-
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
-
- for(unsigned int i = 0; i < _num_inputs; i++)
+ switch(_num_inputs)
{
- _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
- width_offset += inputs_vector.at(i)->info()->dimension(0);
+ case 2:
+ // Configure WidthConcatenate2Tensors kernel
+ _concat_x2_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), output);
+ break;
+ case 4:
+ // Configure WidthConcatenate4Tensors kernel
+ _concat_x4_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+ break;
+ default:
+ // Configure the generic WidthConcatenate kernels
+ _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+
+ unsigned int width_offset = 0;
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+ width_offset += inputs_vector.at(i)->info()->dimension(0);
+ }
+ break;
}
}
@@ -91,8 +125,19 @@
{
cl::CommandQueue q = CLScheduler::get().queue();
- for(unsigned i = 0; i < _num_inputs; i++)
+ switch(_num_inputs)
{
- CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+ case 2:
+ CLScheduler::get().enqueue(_concat_x2_kernel, true);
+ break;
+ case 4:
+ CLScheduler::get().enqueue(_concat_x4_kernel, true);
+ break;
+ default:
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+ }
+ break;
}
}
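A configuration sketch of the dispatch above: exactly two (or four) inputs hit the new specialized kernels, while any other count falls back to the generic per-input kernels. Shapes are illustrative.

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"

#include <vector>

using namespace arm_compute;

void configure_concat(CLTensor &lhs, CLTensor &rhs, CLTensor &dst)
{
    // Two 8x16 inputs produce a 16x16 output and take the WidthConcatenate2Tensors fast path
    std::vector<ICLTensor *> inputs = { &lhs, &rhs };

    CLWidthConcatenateLayer concat;
    concat.configure(inputs, &dst);
}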
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index a70389a..1abcb67 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -104,9 +104,9 @@
// Check if the Winograd configuration requires fast math
if(!enable_fast_math)
{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); // Disable Winograd for FP16 when fast math is disabled.
ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
}
-
const WinogradInfo winograd_info = WinogradInfo(output_tile,
kernel_size,
input_dims,
@@ -129,7 +129,8 @@
_filter_transform.configure(weights, &_input1, winograd_info);
// Configure batched matrix multiply
- _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+ _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, GEMMLowpOutputStageInfo(),
+ (input->info()->data_type() == DataType::F16)));
// Configure output transform
_output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
@@ -161,6 +162,7 @@
// Check if the Winograd configuration requires fast math
if(!enable_fast_math)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); // Disable Winograd for FP16 when fast math is disabled.
ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
}
@@ -184,7 +186,8 @@
TensorShape batched_mm_output_shape = input0.tensor_shape();
batched_mm_output_shape[0] = input1.tensor_shape()[0];
const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+ GEMMLowpOutputStageInfo(), (input->data_type() == DataType::F16))));
// Configure output transform
ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info));
diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp
new file mode 100644
index 0000000..5a612ba
--- /dev/null
+++ b/src/runtime/CL/functions/CLYOLOLayer.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLYOLOLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLYOLOLayer::configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLYOLOLayerKernel>();
+ k->configure(input, output, act_info, num_classes);
+ _kernel = std::move(k);
+}
+
+Status CLYOLOLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+ return CLYOLOLayerKernel::validate(input, output, act_info, num_classes);
+}
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
index 2d52f33..187f52f 100644
--- a/src/runtime/CL/tuners/BifrostTuner.cpp
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -132,9 +132,12 @@
// Configure the local work size for Bifrost with a value obtained
// via exhaustive autotuning over 30 representative tensor shapes.
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+ if(gpu_target_is_in(gpu_target,
+ GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+ GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+ GPUTarget::G52, GPUTarget::G52LIT))
{
- if((k._convolved_dims.first == 7) || (k._convolved_dims.first == 14))
+ if((k._convolved_dims.width == 7) || (k._convolved_dims.width == 14))
{
lws_hint = cl::NDRange(1, 7, 1);
}
@@ -153,7 +156,11 @@
const GPUTarget gpu_target = k.get_target();
// Local work size optimized for the 11x11 AlexNet convolution on Bifrost.
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76) && k._kernel_dims.width == 11)
+ if(gpu_target_is_in(gpu_target,
+ GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+ GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+ GPUTarget::G52, GPUTarget::G52LIT)
+ && k._kernel_dims.width == 11)
{
const bool is_square_kernel = (k._kernel_dims.width == k._kernel_dims.height);
if(!is_square_kernel && k._kernel_dims.width > 1 && !k._conv_info.has_padding())
@@ -171,7 +178,10 @@
// Configure the local work size for Bifrost with a value obtained
// via exhaustive autotuning for the MobileNets tensor shapes.
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+ if(gpu_target_is_in(gpu_target,
+ GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+ GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+ GPUTarget::G52, GPUTarget::G52LIT))
{
lws_hint = cl::NDRange(1, 2, 1);
}
@@ -186,7 +196,10 @@
// Configure the local work size for Bifrost with a value obtained
// via exhaustive autotuning for the MobileNets tensor shapes.
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+ if(gpu_target_is_in(gpu_target,
+ GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+ GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+ GPUTarget::G52, GPUTarget::G52LIT))
{
lws_hint = cl::NDRange(1, 1, 1);
}
@@ -207,6 +220,8 @@
case GPUTarget::G51:
case GPUTarget::G51BIG:
case GPUTarget::G51LIT:
+ case GPUTarget::G52:
+ case GPUTarget::G52LIT:
case GPUTarget::G76:
if(k._input1->info()->dimension(1) == 24)
{
@@ -240,7 +255,10 @@
// invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
if(k._input->info()->data_layout() == DataLayout::NCHW)
{
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+ if(gpu_target_is_in(gpu_target,
+ GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+ GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+ GPUTarget::G52, GPUTarget::G52LIT))
{
cl::NDRange gws = ICLKernel::gws_from_window(k.window());
lws_hint = cl::NDRange(gws[0], gws[1], 1);
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index de28b4f..2b179fd 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -215,7 +215,6 @@
: _num_threads(num_threads_hint()),
_threads(_num_threads - 1)
{
- get_cpu_configuration(_cpu_info);
}
void CPPScheduler::set_num_threads(unsigned int num_threads)
@@ -229,6 +228,7 @@
return _num_threads;
}
+#ifndef DOXYGEN_SKIP_THIS
void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads)
{
const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size()));
@@ -263,6 +263,7 @@
std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
}
}
+#endif /* DOXYGEN_SKIP_THIS */
void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
{
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
new file mode 100644
index 0000000..2e10152
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
+
+#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
+ ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CPPBoxWithNonMaximaSuppressionLimitKernel>();
+ k->configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
+ _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index 6c21086..ac19d08 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -134,6 +134,9 @@
}
break;
case 0xd0b: // A76
+ case 0xd06:
+ case 0xd0c:
+ case 0xd0d:
model = CPUModel::GENERIC_FP16_DOT;
break;
default:
@@ -160,8 +163,8 @@
std::string line;
if(bool(getline(file, line)))
{
- const unsigned long midr = support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16);
- c = midr_to_model(midr & 0xffffffff);
+ const uint32_t midr = support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16);
+ c = midr_to_model(midr & 0xffffffff);
}
}
}
@@ -170,11 +173,11 @@
void populate_models_cpuinfo(std::vector<CPUModel> &cpusv)
{
// If "long-form" cpuinfo is present, parse that to populate models.
- std::regex proc_regex("^processor.*(\\d+)$");
- std::regex imp_regex("^CPU implementer.*0x(..)$");
- std::regex var_regex("^CPU variant.*0x(.)$");
- std::regex part_regex("^CPU part.*0x(...)$");
- std::regex rev_regex("^CPU revision.*(\\d+)$");
+ std::regex proc_regex(R"(^processor.*(\d+)$)");
+ std::regex imp_regex(R"(^CPU implementer.*0x(..)$)");
+ std::regex var_regex(R"(^CPU variant.*0x(.)$)");
+ std::regex part_regex(R"(^CPU part.*0x(...)$)");
+ std::regex rev_regex(R"(^CPU revision.*(\d+)$)");
std::ifstream file;
file.open("/proc/cpuinfo", std::ios::in);
@@ -317,10 +320,12 @@
hwcaps_fp16_support = true;
}
+#if defined(__aarch64__)
if((hwcaps & HWCAP_ASIMDDP) != 0)
{
hwcaps_dot_support = true;
}
+#endif /* defined(__aarch64__) */
const unsigned int max_cpus = get_max_cpus();
cpuinfo.set_cpu_num(max_cpus);
@@ -334,17 +339,18 @@
populate_models_cpuinfo(percpu);
}
int j(0);
- // Update dot product and FP16 support if all CPUs support these features:
- bool all_support_dot = true;
- bool all_support_fp16 = true;
+ // Update dot product and FP16 support if at least one of the CPUs supports these features
+ // We assume that the system does not have mixed architectures
+ bool one_supports_dot = false;
+ bool one_supports_fp16 = false;
for(const auto &v : percpu)
{
- all_support_dot &= model_supports_dot(v);
- all_support_fp16 &= model_supports_fp16(v);
+ one_supports_dot = one_supports_dot || model_supports_dot(v);
+ one_supports_fp16 = one_supports_fp16 || model_supports_fp16(v);
cpuinfo.set_cpu_model(j++, v);
}
- cpuinfo.set_dotprod(all_support_dot || hwcaps_dot_support);
- cpuinfo.set_fp16(all_support_fp16 || hwcaps_fp16_support);
+ cpuinfo.set_dotprod(one_supports_dot || hwcaps_dot_support);
+ cpuinfo.set_fp16(one_supports_fp16 || hwcaps_fp16_support);
#else /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
ARM_COMPUTE_UNUSED(cpuinfo);
#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
diff --git a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
index cdd12c3..70a1f4f 100644
--- a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
+++ b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
@@ -22,10 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
#include <cstddef>
@@ -34,24 +34,26 @@
void *GCBufferAllocator::allocate(size_t size, size_t alignment)
{
ARM_COMPUTE_UNUSED(alignment);
- auto *gl_buffer = new GLBufferWrapper();
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, gl_buffer->_ssbo_name));
+
+ // Generate a valid buffer name before binding it; binding an uninitialized GLuint is undefined
+ auto *gl_ssbo_name = new GLuint;
+ ARM_COMPUTE_GL_CHECK(glGenBuffers(1, gl_ssbo_name));
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, *gl_ssbo_name));
ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW));
ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
- return reinterpret_cast<void *>(gl_buffer);
+ return reinterpret_cast<void *>(gl_ssbo_name);
}
void GCBufferAllocator::free(void *ptr)
{
ARM_COMPUTE_ERROR_ON(ptr == nullptr);
- auto *gl_buffer = reinterpret_cast<GLBufferWrapper *>(ptr);
- delete gl_buffer;
+ auto *gl_ssbo_name = reinterpret_cast<GLuint *>(ptr);
+ ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, gl_ssbo_name));
+ delete gl_ssbo_name;
}
std::unique_ptr<IMemoryRegion> GCBufferAllocator::make_region(size_t size, size_t alignment)
{
- ARM_COMPUTE_UNUSED(size, alignment);
- return nullptr;
+ ARM_COMPUTE_UNUSED(alignment);
+ return arm_compute::support::cpp14::make_unique<GCBufferMemoryRegion>(size);
}
} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp
new file mode 100644
index 0000000..fed4a15
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/GCMemory.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemory.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
+
+namespace arm_compute
+{
+GCMemory::GCMemory()
+ : _region(nullptr), _region_owned(nullptr)
+{
+}
+
+GCMemory::GCMemory(std::shared_ptr<IGCMemoryRegion> memory)
+ : _region(nullptr), _region_owned(std::move(memory))
+{
+ _region = _region_owned.get();
+}
+
+GCMemory::GCMemory(IGCMemoryRegion *memory)
+ : _region(memory), _region_owned(nullptr)
+{
+}
+
+IGCMemoryRegion *GCMemory::gc_region()
+{
+ return _region;
+}
+
+IGCMemoryRegion *GCMemory::gc_region() const
+{
+ return _region;
+}
+
+IMemoryRegion *GCMemory::region()
+{
+ return _region;
+}
+
+IMemoryRegion *GCMemory::region() const
+{
+ return _region;
+}
+
+void GCMemory::set_region(IMemoryRegion *region)
+{
+ auto gc_region = utils::cast::polymorphic_downcast<IGCMemoryRegion *>(region);
+ _region_owned = nullptr;
+ _region = gc_region;
+}
+
+void GCMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
+{
+ _region_owned = utils::cast::polymorphic_downcast_unique_ptr<IGCMemoryRegion>(std::move(region));
+ _region = _region_owned.get();
+}
+} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp b/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp
new file mode 100644
index 0000000..45fd6e8
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+IGCMemoryRegion::IGCMemoryRegion(size_t size)
+ : IMemoryRegion(size), _mapping(nullptr), _ssbo_name(0)
+{
+}
+
+const GLuint &IGCMemoryRegion::gc_ssbo_name() const
+{
+ return _ssbo_name;
+}
+
+void *IGCMemoryRegion::buffer()
+{
+ return _mapping;
+}
+
+void *IGCMemoryRegion::buffer() const
+{
+ return _mapping;
+}
+
+GCBufferMemoryRegion::GCBufferMemoryRegion(size_t size)
+ : IGCMemoryRegion(size)
+{
+ ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_ssbo_name));
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name));
+ ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW));
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+}
+
+GCBufferMemoryRegion::~GCBufferMemoryRegion()
+{
+ ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_ssbo_name));
+}
+
+void *GCBufferMemoryRegion::ptr()
+{
+ return nullptr;
+}
+
+void *GCBufferMemoryRegion::map(bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
+ ARM_COMPUTE_UNUSED(blocking);
+
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name));
+ void *p = ARM_COMPUTE_GL_CHECK(glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, static_cast<GLsizeiptr>(size()), GL_MAP_READ_BIT | GL_MAP_WRITE_BIT));
+ _mapping = reinterpret_cast<uint8_t *>(p);
+
+ return _mapping;
+}
+
+void GCBufferMemoryRegion::unmap()
+{
+ ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name));
+ ARM_COMPUTE_GL_CHECK(glUnmapBuffer(GL_SHADER_STORAGE_BUFFER));
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+ _mapping = nullptr;
+}
+
+std::unique_ptr<IMemoryRegion> GCBufferMemoryRegion::extract_subregion(size_t offset, size_t size)
+{
+ ARM_COMPUTE_UNUSED(offset, size);
+ return nullptr;
+}
+} // namespace arm_compute
\ No newline at end of file
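A minimal usage sketch for the new region class. It assumes a GLES compute context is already up; GCScheduler::get().default_init() is how the library's GLES examples create one, and the 1 KiB size is illustrative:

#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"

#include <cstring>

void fill_region_example()
{
    // Bring up the GLES compute context once per process (assumption: as in the GLES examples).
    arm_compute::GCScheduler::get().default_init();

    // Back 1 KiB of device memory with an SSBO; the buffer name is generated in the constructor.
    arm_compute::GCBufferMemoryRegion region(1024);

    // Map the SSBO into host memory, write to it, then unmap before dispatching kernels.
    void *ptr = region.map(true);
    std::memset(ptr, 0, region.size());
    region.unmap();
}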
diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
index abd2b48..a0dd540 100644
--- a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
@@ -26,21 +26,17 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
#include "support/ToolchainSupport.h"
using namespace arm_compute;
GCTensorAllocator::GCTensorAllocator(GCTensor *owner)
- : _associated_memory_group(nullptr), _gl_buffer(), _mapping(nullptr), _owner(owner)
+ : _associated_memory_group(nullptr), _memory(), _mapping(nullptr), _owner(owner)
{
}
-GCTensorAllocator::~GCTensorAllocator()
-{
- _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
-}
-
uint8_t *GCTensorAllocator::data()
{
return _mapping;
@@ -50,32 +46,28 @@
{
if(_associated_memory_group == nullptr)
{
- _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
- ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(info().total_size()), nullptr, GL_STATIC_DRAW));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+ _memory.set_owned_region(support::cpp14::make_unique<GCBufferMemoryRegion>(info().total_size()));
}
else
{
- _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_gl_buffer), info().total_size());
+ _associated_memory_group->finalize_memory(_owner, _memory, info().total_size());
}
info().set_is_resizable(false);
}
void GCTensorAllocator::free()
{
- if(_associated_memory_group == nullptr)
- {
- _gl_buffer.reset();
- info().set_is_resizable(true);
- }
+ _mapping = nullptr;
+ _memory.set_region(nullptr);
+ info().set_is_resizable(true);
}
void GCTensorAllocator::set_associated_memory_group(GCMemoryGroup *associated_memory_group)
{
ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
- ARM_COMPUTE_ERROR_ON(_gl_buffer.get() != nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region() != nullptr && _memory.gc_region()->gc_ssbo_name() != 0);
+
_associated_memory_group = associated_memory_group;
}
@@ -91,27 +83,23 @@
GLuint GCTensorAllocator::get_gl_ssbo_name() const
{
- return _gl_buffer->_ssbo_name;
+ return (_memory.region() == nullptr) ? static_cast<GLuint>(0) : _memory.gc_region()->gc_ssbo_name();
}
uint8_t *GCTensorAllocator::map(bool blocking)
{
ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
- ARM_COMPUTE_UNUSED(blocking);
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
- void *p = ARM_COMPUTE_GL_CHECK(glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, static_cast<GLsizeiptr>(info().total_size()), GL_MAP_READ_BIT | GL_MAP_WRITE_BIT));
- _mapping = reinterpret_cast<uint8_t *>(p);
-
+ _mapping = reinterpret_cast<uint8_t *>(_memory.gc_region()->map(blocking));
return _mapping;
}
void GCTensorAllocator::unmap()
{
ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
- ARM_COMPUTE_GL_CHECK(glUnmapBuffer(GL_SHADER_STORAGE_BUFFER));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+ _memory.gc_region()->unmap();
_mapping = nullptr;
}
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index a7a56b6..c58d184 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -150,6 +150,7 @@
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
+ // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
TensorInfo im2col_reshaped_info(shape_im2col, 1, dt);
_input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
_memory_group.manage(&_input_im2col_reshaped);
@@ -160,6 +161,7 @@
shape_gemm.set(1, mat_input_rows);
const DataType gemm_data_type = dt;
+ // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
_gemm_output.allocator()->init(info_gemm);
_memory_group.manage(&_gemm_output);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index 7121654..d9aa50d 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -31,11 +31,12 @@
using namespace arm_compute;
GCDepthwiseConvolutionLayer3x3::GCDepthwiseConvolutionLayer3x3()
- : _kernel(nullptr), _border_handler(), _shift_handler()
+ : _kernel(nullptr), _border_handler(), _shift_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
{
}
-void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
{
auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
k->configure(input, weights, biases, output, conv_info, depth_multiplier);
@@ -45,6 +46,14 @@
_border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
_shift_handler.configure(input);
+
+ //Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
void GCDepthwiseConvolutionLayer3x3::run()
@@ -54,4 +63,10 @@
GCScheduler::get().dispatch(_border_handler, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(*_kernel);
+
+ // Run Activation Layer
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
index 5fb971c..19fdc3d 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,9 +37,15 @@
{
}
-void GCNormalizePlanarYUVLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *sd)
+void GCNormalizePlanarYUVLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std)
{
- _norm_kernel.configure(input, output, mean, sd);
+ _norm_kernel.configure(input, output, mean, std);
+}
+
+Status GCNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *mean, const ITensorInfo *std)
+{
+ return GCNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
}
void GCNormalizePlanarYUVLayer::run()
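The layer normalises each channel as out = (in - mean) / std, and the new static validate() lets callers check tensor descriptors before configuring. A hedged sketch of that check (the helper name and error handling are illustrative):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h"

// Returns true if out = (in - mean) / std is a valid configuration for the given descriptors.
bool can_normalize(const arm_compute::ITensorInfo *in, const arm_compute::ITensorInfo *out,
                   const arm_compute::ITensorInfo *mean, const arm_compute::ITensorInfo *std_dev)
{
    const arm_compute::Status s = arm_compute::GCNormalizePlanarYUVLayer::validate(in, out, mean, std_dev);
    return s.error_code() == arm_compute::ErrorCode::OK;
}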
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index 0c8769b..dad42cd 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -34,12 +34,13 @@
{
}
-void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta)
+void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, size_t axis)
{
- ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_UNUSED(beta, axis);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON(beta != 1.0f);
+ ARM_COMPUTE_ERROR_ON_MSG(axis != 1, "Axis must be 1 for GLES");
// Create intermediate tensors shapes
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 54a2bd2..b2edad0 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/IScheduler.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CPUUtils.h"
namespace arm_compute
@@ -30,6 +31,7 @@
IScheduler::IScheduler()
: _cpu_info()
{
+ get_cpu_configuration(_cpu_info);
// Work out the best possible number of execution threads
_num_threads_hint = get_threads_hint();
}
@@ -43,4 +45,10 @@
{
return _num_threads_hint;
}
+void IScheduler::run_tagged_workloads(std::vector<Workload> &workloads, const char *tag)
+{
+ ARM_COMPUTE_UNUSED(tag);
+ run_workloads(workloads);
+}
+
} // namespace arm_compute
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index faaff8a..7d928d6 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemory.h"
#include "arm_compute/runtime/IMemoryGroup.h"
#include "arm_compute/runtime/IMemoryPool.h"
#include "support/ToolchainSupport.h"
@@ -70,7 +71,7 @@
_active_elements.insert(std::make_pair(obj, obj));
}
-void ISimpleLifetimeManager::end_lifetime(void *obj, void **handle, size_t size)
+void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t size)
{
ARM_COMPUTE_ERROR_ON(obj == nullptr);
@@ -80,7 +81,7 @@
// Update object fields and mark object as complete
Element &el = active_object_it->second;
- el.handle = handle;
+ el.handle = &obj_memory;
el.size = size;
el.status = true;
diff --git a/src/runtime/MEMUtils.cpp b/src/runtime/MEMUtils.cpp
new file mode 100644
index 0000000..ad00070
--- /dev/null
+++ b/src/runtime/MEMUtils.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+#ifndef BARE_METAL
+#include <fstream>
+#include <regex>
+#include <sstream>
+#endif // ifndef BARE_METAL
+
+namespace
+{
+void parse_mem_info(size_t &total, size_t &free, size_t &buffer)
+{
+ free = 0;
+ total = 0;
+ buffer = 0;
+#ifndef BARE_METAL
+ size_t memcache = 0;
+ size_t memfree = 0;
+ std::ifstream meminfo_f;
+ meminfo_f.open("/proc/meminfo", std::ios::in);
+ if(meminfo_f.is_open())
+ {
+ std::stringstream str_stream;
+ str_stream << meminfo_f.rdbuf();
+ const std::string str = str_stream.str();
+ try
+ {
+ std::smatch match;
+ if(std::regex_search(str, match, std::regex("MemTotal: (.*)kB")) && match.size() > 1)
+ {
+ const std::string result = match.str(1);
+ total = std::stoul(result, nullptr, 0);
+ }
+ if(std::regex_search(str, match, std::regex("MemFree: (.*)kB")) && match.size() > 1)
+ {
+ const std::string result = match.str(1);
+ memfree = std::stoul(result, nullptr, 0);
+ }
+ if(std::regex_search(str, match, std::regex("Buffers: (.*)kB")) && match.size() > 1)
+ {
+ const std::string result = match.str(1);
+ buffer = std::stoul(result, nullptr, 0);
+ }
+ if(std::regex_search(str, match, std::regex("Cached: (.*)kB")) && match.size() > 1)
+ {
+ const std::string result = match.str(1);
+ memcache = std::stoul(result, nullptr, 0);
+ }
+ free = memfree + (buffer + memcache);
+ }
+ catch(std::regex_error &e)
+ {
+ // failed parsing /proc/meminfo
+ // return 0s on all fields
+ }
+ }
+#endif // ifndef BARE_METAL
+}
+
+} // namespace
+
+namespace arm_compute
+{
+void MEMInfo::set_policy(MemoryPolicy policy)
+{
+ _policy = policy;
+}
+
+MemoryPolicy MEMInfo::get_policy()
+{
+ return _policy;
+}
+MemoryPolicy MEMInfo::_policy = { MemoryPolicy::NORMAL };
+
+MEMInfo::MEMInfo()
+ : _total(0), _free(0), _buffer(0)
+{
+ parse_mem_info(_total, _free, _buffer);
+}
+
+size_t MEMInfo::get_total_in_kb() const
+{
+ return _total;
+}
+
+} // namespace arm_compute
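A standalone sketch of the parsing approach used above: pull one "kB" field out of a meminfo-style blob with std::regex. The helper name is illustrative; field names follow /proc/meminfo:

#include <regex>
#include <string>

// Returns the value (in kB) of a field such as "MemTotal" or "MemFree", or 0 if absent.
size_t meminfo_field_kb(const std::string &meminfo, const std::string &field)
{
    std::smatch      match;
    const std::regex re(field + ": (.*)kB");
    if(std::regex_search(meminfo, match, re) && match.size() > 1)
    {
        return std::stoul(match.str(1), nullptr, 0);
    }
    return 0;
}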
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index 15bbb17..d116624 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -30,17 +30,13 @@
Memory::Memory()
: _region(nullptr), _region_owned(nullptr)
{
- create_empty_region();
}
Memory::Memory(std::shared_ptr<IMemoryRegion> memory)
: _region(nullptr), _region_owned(std::move(memory))
{
- if(_region_owned == nullptr)
- {
- create_empty_region();
- }
- _region = _region_owned.get();
+ _region = _region_owned.get();
}
Memory::Memory(IMemoryRegion *memory)
@@ -59,9 +55,15 @@
return _region;
}
-void Memory::create_empty_region()
+void Memory::set_region(IMemoryRegion *region)
{
- _region_owned = std::make_shared<MemoryRegion>(0);
+ _region_owned = nullptr;
+ _region = region;
+}
+
+void Memory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
+{
+ _region_owned = std::move(region);
_region = _region_owned.get();
}
} // namespace arm_compute
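The two setters above give Memory an explicit owned/unowned split. A short sketch of the intended usage, assuming MemoryRegion's size-only constructor and with illustrative sizes:

#include "arm_compute/runtime/Memory.h"
#include "arm_compute/runtime/MemoryRegion.h"
#include "support/ToolchainSupport.h"

void memory_ownership_example()
{
    arm_compute::Memory mem;

    // Owning: the Memory object keeps the region alive.
    mem.set_owned_region(arm_compute::support::cpp14::make_unique<arm_compute::MemoryRegion>(256));

    // Non-owning: the caller must guarantee the region outlives the Memory object.
    arm_compute::MemoryRegion external(128);
    mem.set_region(&external);
}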
diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp
index 4dfa28b..d9803a8 100644
--- a/src/runtime/MemoryManagerOnDemand.cpp
+++ b/src/runtime/MemoryManagerOnDemand.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,33 +29,15 @@
#include <memory>
-using namespace arm_compute;
-
+namespace arm_compute
+{
MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager)
- : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)), _allocator(nullptr), _is_finalized(false), _num_pools(1)
+ : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager))
{
ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!");
ARM_COMPUTE_ERROR_ON_MSG(!_pool_mgr, "Pool manager not specified correctly!");
}
-bool MemoryManagerOnDemand::is_finalized() const
-{
- return _is_finalized;
-}
-
-void MemoryManagerOnDemand::set_num_pools(unsigned int num_pools)
-{
- ARM_COMPUTE_ERROR_ON(num_pools == 0);
- _num_pools = num_pools;
-}
-
-void MemoryManagerOnDemand::set_allocator(IAllocator *allocator)
-{
- ARM_COMPUTE_ERROR_ON_MSG(is_finalized(), "Memory manager is already finalized!");
- ARM_COMPUTE_ERROR_ON(allocator == nullptr);
- _allocator = allocator;
-}
-
ILifetimeManager *MemoryManagerOnDemand::lifetime_manager()
{
return _lifetime_mgr.get();
@@ -66,23 +48,26 @@
return _pool_mgr.get();
}
-void MemoryManagerOnDemand::finalize()
+void MemoryManagerOnDemand::populate(arm_compute::IAllocator &allocator, size_t num_pools)
{
- ARM_COMPUTE_ERROR_ON_MSG(is_finalized(), "Memory manager is already finalized!");
ARM_COMPUTE_ERROR_ON(!_lifetime_mgr);
ARM_COMPUTE_ERROR_ON(!_pool_mgr);
- ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr->are_all_finalized(), "All the objects have not been finalized! ");
- ARM_COMPUTE_ERROR_ON(_allocator == nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr->are_all_finalized(), "All the objects have not been finalized!");
+ ARM_COMPUTE_ERROR_ON_MSG(_pool_mgr->num_pools() != 0, "Pool manager already contains pools!");
// Create pools
- auto pool_template = _lifetime_mgr->create_pool(_allocator);
- for(int i = _num_pools; i > 1; --i)
+ auto pool_template = _lifetime_mgr->create_pool(&allocator);
+ for(int i = num_pools; i > 1; --i)
{
auto pool = pool_template->duplicate();
_pool_mgr->register_pool(std::move(pool));
}
_pool_mgr->register_pool(std::move(pool_template));
-
- // Set finalized to true
- _is_finalized = true;
}
+
+void MemoryManagerOnDemand::clear()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_pool_mgr, "Pool manager not specified correctly!");
+ _pool_mgr->clear_pools();
+}
+} //namespace arm_compute
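With finalize(), set_allocator() and set_num_pools() gone, pool creation is now a single populate() call. A hedged sketch of the new flow, assuming the usual BlobLifetimeManager/PoolManager/Allocator classes from the runtime:

#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

void memory_manager_example()
{
    auto lifetime_mgr = std::make_shared<arm_compute::BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<arm_compute::PoolManager>();
    auto mm           = std::make_shared<arm_compute::MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // ... hand mm to memory-managed functions and configure them ...

    // Once every managed object has been finalized, back the manager with one pool:
    arm_compute::Allocator allocator{};
    mm->populate(allocator, 1 /* num_pools */);

    // ... run the workload ...

    mm->clear(); // release the pools when done
}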
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 7d8e3cf..677e9f6 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -29,8 +29,8 @@
#include <utility>
-using namespace arm_compute;
-
+namespace arm_compute
+{
void NEArithmeticAddition::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy)
{
auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
@@ -51,3 +51,4 @@
{
return NEArithmeticAdditionKernel::validate(input1, input2, output, policy);
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 5c0491e..ceb4b49 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,20 +23,33 @@
*/
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
#include "support/ToolchainSupport.h"
#include <utility>
-using namespace arm_compute;
-
-void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+namespace arm_compute
+{
+void NEArithmeticSubtraction::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy)
{
auto k = arm_compute::support::cpp14::make_unique<NEArithmeticSubtractionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
}
+
Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy);
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index d72c98b..0e5d50f 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -58,8 +58,7 @@
{
}
-void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value,
- bool use_fp16)
+void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -127,18 +126,9 @@
_memory_group.manage(&_phase);
// Configure gradient
- if(use_fp16)
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGradientFP16Kernel>();
- k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
- _gradient = std::move(k);
- }
- else
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
- k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
- _gradient = std::move(k);
- }
+ auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
+ k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
+ _gradient = std::move(k);
// Allocate intermediate tensors
_gx.allocator()->allocate();
diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
new file mode 100644
index 0000000..485abfe
--- /dev/null
+++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEChannelShuffleLayer::configure(const ITensor *input, ITensor *output, unsigned int num_groups)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEChannelShuffleLayerKernel>();
+ k->configure(input, output, num_groups);
+ _kernel = std::move(k);
+}
+
+Status NEChannelShuffleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+ return NEChannelShuffleLayerKernel::validate(input, output, num_groups);
+}
+} // namespace arm_compute
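A minimal configure/run sketch for the new NEON channel-shuffle function; shapes are illustrative and NCHW layout is assumed:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
#include "arm_compute/runtime/Tensor.h"

void channel_shuffle_example()
{
    using namespace arm_compute;

    Tensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));

    NEChannelShuffleLayer shuffle{};
    shuffle.configure(&src, &dst, 4 /* num_groups */);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src ...
    shuffle.run();
}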
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index fda9f57..6887a0a 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
@@ -35,7 +36,10 @@
: _memory_group(std::move(memory_manager)),
_conv_f(),
_upsample_f(),
+ _flip_weights(),
_scaled_output(),
+ _weights_flipped(),
+ _original_weights(nullptr),
_input(nullptr),
_info(),
_inner_border(),
@@ -60,9 +64,9 @@
ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1),
- info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+ info.pad().first, info.pad().second, stride_x, stride_y);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
if(bias != nullptr)
{
@@ -73,15 +77,17 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
}
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, stride_x, stride_y, inner_border_right, inner_border_top,
- info)));
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
@@ -99,33 +105,45 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- _input = input;
- _info = info;
- _inner_border = std::make_pair(inner_border_right, inner_border_top);
- _is_prepared = false;
+ _input = input;
+ _original_weights = weights;
+ _info = info;
+ _inner_border = std::make_pair(inner_border_right, inner_border_top);
+ _is_prepared = false;
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
+ _weights_flipped.allocator()->init(TensorInfo(weights->info()->tensor_shape(), 1, weights->info()->data_type()));
+ _flip_weights.configure(weights, &_weights_flipped);
+
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ info.pad().first, info.pad().second, stride_x, stride_y);
+
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
_memory_group.manage(&_scaled_output);
- // configure scale function
- // Init and allocate intermmidiate tensor for output, same size as input but the first two axis are the same as the output tensor
- const TensorInfo scale_out_info(compute_deconvolution_shape(*input->info(), stride_x, stride_y, inner_border_right, inner_border_top, info), 1, input->info()->data_type());
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
_scaled_output.allocator()->init(scale_out_info);
+ const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+ _upsample_f.configure(input, &_scaled_output, upsample_info, inner_border_right, inner_border_top);
+
// setup the function to convolve the upscaled output
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- _conv_f.configure(&_scaled_output, weights, bias, output, conv_info);
-
- // Allocate auxiliary tensors
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
_scaled_output.allocator()->allocate();
-
- // configure upsample function
- _upsample_f.configure(input, &_scaled_output, info, inner_border_right, inner_border_top);
}
void NEDeconvolutionLayer::run()
@@ -144,7 +162,21 @@
{
if(!_is_prepared)
{
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ NEScheduler::get().schedule(&_flip_weights, Window::DimZ);
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
_conv_f.prepare();
+
+ if(!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
_is_prepared = true;
}
-}
\ No newline at end of file
+}
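For reference, the output size this code path targets follows the standard transposed-convolution relation, sketched below; this mirrors what deconvolution_output_dimensions computes once the deprecated inner-border arguments are dropped (the helper name here is illustrative):

#include <utility>

// Standard transposed-convolution output size: out = stride * (in - 1) + kernel - 2 * pad.
std::pair<unsigned int, unsigned int> deconv_out_dims(unsigned int in_w, unsigned int in_h,
                                                      unsigned int k_w, unsigned int k_h,
                                                      unsigned int pad_x, unsigned int pad_y,
                                                      unsigned int stride_x, unsigned int stride_y)
{
    const unsigned int out_w = stride_x * (in_w - 1) + k_w - 2 * pad_x;
    const unsigned int out_h = stride_y * (in_h - 1) + k_h - 2 * pad_y;
    return std::make_pair(out_w, out_h);
}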
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 9a75404..0041c1f 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,9 +30,14 @@
using namespace arm_compute;
-void NEDepthConvertLayer::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>();
k->configure(input, output, policy, shift);
_kernel = std::move(k);
}
+
+Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+{
+ return NEDepthConvertLayerKernel::validate(input, output, policy, shift);
+}
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 24b12f4..a2f0094 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -36,14 +36,16 @@
using namespace arm_compute::misc::shape_calculator;
NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
- : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(),
- _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false)
+ : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(),
+ _permuted_weights(), _permuted_output(), _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false),
+ _is_activationlayer_enabled(false)
{
}
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
PixelValue zero_value(0.f);
@@ -59,8 +61,25 @@
_is_nchw = input->info()->data_layout() == DataLayout::NCHW;
_permute = _is_optimized == _is_nchw;
+ // Initialize the intermediate accumulator tensor in case of quantized input
+ if(_is_quantized)
+ {
+ TensorShape accum_shape = output->info()->tensor_shape();
+ DataLayout accum_layout = output->info()->data_layout();
+ if(!_is_optimized && !_is_nchw)
+ {
+ permute(accum_shape, PermutationVector(1U, 2U, 0U));
+ accum_layout = DataLayout::NCHW;
+ }
+
+ _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, input->info()->quantization_info()));
+ _accumulator.info()->set_data_layout(accum_layout);
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ }
+
if(_is_optimized)
{
+ ITensor *optimized_output = (_is_quantized) ? &_accumulator : output;
if(_is_nchw)
{
// Configure the function to transform the input tensor from NCHW -> NHWC
@@ -75,8 +94,8 @@
_dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
- _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ _permute_output.configure(&_permuted_output, optimized_output, PermutationVector(1U, 2U, 0U));
// Allocate tensors
_permuted_input.allocator()->allocate();
@@ -85,26 +104,11 @@
}
else
{
- _dwc_kernel.configure(input, weights, output, conv_info, depth_multiplier, DataLayout::NHWC);
+ _dwc_kernel.configure(input, weights, optimized_output, conv_info, depth_multiplier, DataLayout::NHWC);
}
}
else
{
- // Allocate the intermediate accumulator tensor in case of quantized input
- if(_is_quantized)
- {
- TensorShape accum_shape = output->info()->tensor_shape();
-
- if(!_is_nchw)
- {
- permute(accum_shape, PermutationVector(1U, 2U, 0U));
- }
-
- _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32));
- _accumulator.info()->set_quantization_info(input->info()->quantization_info());
- zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
- }
-
if(!_is_nchw)
{
// Configure the function to transform the input tensor from NHWC -> NCHW
@@ -143,7 +147,7 @@
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
int output_multiplier, output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+ _output_stage_kernel.configure(&_accumulator, biases, (_is_nchw || _is_optimized) ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
_accumulator.allocator()->allocate();
}
else if(_has_bias)
@@ -157,21 +161,46 @@
_permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
_permuted_output.allocator()->allocate();
}
+
+ //Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
if(biases != nullptr)
{
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
}
- return NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, output, conv_info, depth_multiplier);
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+ }
+
+ //Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+ }
+
+ return Status{};
}
void NEDepthwiseConvolutionLayer3x3::run()
@@ -222,16 +251,22 @@
{
_permute_output.run();
}
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
- _permute_weights(), _permute_output(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false),
- _is_quantized(false), _is_nhwc(false), _original_weights(nullptr)
+ _permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(),
+ _permuted_output(), _is_prepared(false), _is_quantized(false), _is_nhwc(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
{
}
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
{
const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_UNUSED(channel_idx);
@@ -353,13 +388,24 @@
// Allocate intermediate tensors
_input_reshaped.allocator()->allocate();
_v2mm_output.allocator()->allocate();
+
+ //Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+ const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
// Clone output to use auto init
auto output_clone = output->clone();
@@ -391,8 +437,8 @@
const size_t weights_w = weights_to_use->dimension(0);
const size_t weights_h = weights_to_use->dimension(1);
const size_t weights_z = weights_to_use->dimension(2);
- const unsigned int conv_w = output_shape.x();
- const unsigned int conv_h = output_shape.y();
+ const unsigned int conv_w = output_shape[width_idx];
+ const unsigned int conv_h = output_shape[height_idx];
const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
const size_t conv_size = conv_w * conv_h;
@@ -438,6 +484,12 @@
ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output_to_use));
}
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+ }
+
return Status{};
}
@@ -463,6 +515,11 @@
{
_permute_output.run();
}
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
void NEDepthwiseConvolutionLayer::prepare()
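A sketch of the new fused-activation path: passing an ActivationLayerInfo makes the function run the activation internally after the convolution. Shapes and the bounded-ReLU choice are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

void depthwise_relu6_example()
{
    using namespace arm_compute;

    Tensor src{}, weights{}, biases{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(112U, 112U, 32U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 32U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(112U, 112U, 32U), 1, DataType::F32));

    NEDepthwiseConvolutionLayer3x3 dwc{};
    dwc.configure(&src, &weights, &biases, &dst,
                  PadStrideInfo(1, 1, 1, 1), // stride 1, same padding for a 3x3 kernel
                  1,                         // depth multiplier
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));

    for(auto *t : { &src, &weights, &biases, &dst })
    {
        t->allocator()->allocate();
    }

    // ... fill src/weights/biases ...
    dwc.run();
}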
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 1814d61..57bef2b 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -23,7 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
#include "arm_compute/core/Size2D.h"
#include "support/ToolchainSupport.h"
@@ -31,7 +31,12 @@
void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
- k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, false, true);
+ auto k = arm_compute::support::cpp14::make_unique<NEFlattenLayerKernel>();
+ k->configure(input, output);
_kernel = std::move(k);
+}
+
+Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NEFlattenLayerKernel::validate(input, output);
}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
index 0000cdd..8179188 100644
--- a/src/runtime/NEON/functions/NEFloor.cpp
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,11 +26,17 @@
#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
void NEFloor::configure(const ITensor *input, ITensor *output)
{
auto k = arm_compute::support::cpp14::make_unique<NEFloorKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
+
+Status NEFloor::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NEFloorKernel::validate(input, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index f1606aa..45e21b5 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -50,6 +50,7 @@
// Validate gemmlowp function
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
&weights.clone()->set_quantization_info(weights_quantization_info),
+ nullptr,
&output));
}
else
@@ -74,8 +75,8 @@
}
NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _im2col_kernel(), _convert_weights(), _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
- _im2col_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false),
+ : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
+ _flatten_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false),
_is_fc_after_conv(false), _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
{
}
@@ -93,7 +94,7 @@
weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
// Configure gemmlowp function
- _mm_gemmlowp.configure(input, weights, output);
+ _mm_gemmlowp.configure(input, weights, nullptr, output);
// Revert back QuantizatioInfo as input and weights could be used in other fully connected layers
input->info()->set_quantization_info(input_quantization_info);
@@ -112,19 +113,19 @@
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
- // Initialize output tensor for im2col
- TensorShape shape_im2col = compute_flatten_shape(input->info());
- _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ // Initialize output tensor for flatten
+ TensorShape shape_flatten = compute_flatten_shape(input->info());
+ _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
- // Configure im2col kernel
- _memory_group.manage(&_im2col_output);
- _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true);
+ // Configure flatten kernel
+ _memory_group.manage(&_flatten_output);
+ _flatten_kernel.configure(input, &_flatten_output);
// Configure matrix multiply kernel
- configure_mm(&_im2col_output, weights, output);
+ configure_mm(&_flatten_output, weights, output);
- // Allocate the output tensor for im2col once all the configure methods have been called
- _im2col_output.allocator()->allocate();
+ // Allocate the output tensor for flatten once all the configure methods have been called
+ _flatten_output.allocator()->allocate();
}
void NEFullyConnectedLayer::configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output)
@@ -249,7 +250,7 @@
bool is_fc_after_conv = true;
bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)));
+ const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)));
const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -307,9 +308,9 @@
// Fully Connected layer after a Convolution Layer without batches
ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
- // Validate im2col kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true));
- input_to_use = &im2col_input;
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+ input_to_use = &flatten_input;
}
else
{
@@ -337,7 +338,7 @@
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
{
- NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
}
// Run matrix multiply
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index de51266..72a3e80 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -62,7 +62,14 @@
if(run_optimised)
{
- _asm_glue.configure(a, b, d, alpha, beta, _reshape_b_only_on_first_run);
+ if(MEMInfo::get_policy() == MemoryPolicy::MINIMIZE)
+ {
+ _asm_glue.configure(a, b, d, alpha, beta, false);
+ }
+ else
+ {
+ _asm_glue.configure(a, b, d, alpha, beta, _reshape_b_only_on_first_run);
+ }
ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
}
else
@@ -132,7 +139,7 @@
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
@@ -140,7 +147,7 @@
if(c != nullptr)
{
- ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
@@ -150,7 +157,7 @@
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 1)
+ if(gemm_info.depth_output_gemm3d() != 0)
{
if(gemm_info.reinterpret_input_as_3d())
{
@@ -174,7 +181,7 @@
if(!run_optimised)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMM cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
// Check if the first input tensor is a vector.
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
@@ -218,6 +225,12 @@
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
}
+ // Validate matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAdditionKernel::validate(c, output, beta));
+ }
+
return Status{};
}
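
With this change, depth_output_gemm3d() == 0 consistently means "do not reinterpret the output as 3D", and a non-zero depth requires the output rows to be consistent with matrix A once the 3D split is taken into account. A self-contained sketch of that consistency check, using plain integers in place of ITensorInfo:

    // Mirrors the output-shape checks in the updated validate() (illustrative only).
    #include <cassert>

    bool output_rows_consistent(int a_dim1, int a_dim2,
                                int out_dim1, int out_dim2,
                                int depth_output_gemm3d, bool reinterpret_input_as_3d)
    {
        if(depth_output_gemm3d == 0)
        {
            return a_dim1 == out_dim1; // 2D output: same number of rows as A
        }
        if(reinterpret_input_as_3d)
        {
            return a_dim1 == out_dim1 && a_dim2 == out_dim2; // 3D in, 3D out
        }
        return a_dim1 == out_dim1 * out_dim2; // 2D in, output rows split over two dimensions
    }

    int main()
    {
        assert(output_rows_consistent(196, 1, 196, 1, 0, false));   // plain GEMM
        assert(output_rows_consistent(196, 1, 14, 14, 14, false));  // GEMM3D output, depth 14
        assert(!output_rows_consistent(196, 1, 14, 13, 14, false)); // inconsistent split
        return 0;
    }
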
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 29db654..922f757 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -45,6 +45,7 @@
//Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure()
switch(method)
{
+ case arm_gemm::GemmMethod::GEMM_INTERLEAVED_FP16:
case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
{
if(!pretranspose_hint)
@@ -227,7 +228,7 @@
// Forcing 128-byte alignment (required by 32-bit kernels)
const unsigned int alignment = 128;
const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
- _pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment);
+ _pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
_pretranspose.allocator()->allocate();
ARM_COMPUTE_ERROR_ON_NULLPTR(_pretranspose.buffer());
}
@@ -258,7 +259,7 @@
void Fallback<TypeInput, TypeOutput>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment)
{
ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment);
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
memory_group.manage(&_workspace);
_workspace.allocator()->allocate();
}
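
The pretranspose and workspace buffers are still over-allocated by `alignment` bytes; the added FIXME records that this padding should go once COMPMID-1088 lands. A small standalone sketch of why the extra bytes are reserved when rounding a raw base pointer up to a 128-byte boundary; this shows the general over-allocate-and-align technique only, not the library's TensorAllocator internals:

    // General over-allocate-and-align technique (illustrative only).
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <memory>

    void *align_up(void *base, std::size_t alignment)
    {
        const std::uintptr_t p = reinterpret_cast<std::uintptr_t>(base);
        const std::uintptr_t a = static_cast<std::uintptr_t>(alignment);
        return reinterpret_cast<void *>((p + a - 1) & ~(a - 1));
    }

    int main()
    {
        const std::size_t workspace_size = 1000;
        const std::size_t alignment      = 128; // required by the 32-bit kernels

        // Reserving alignment extra bytes guarantees that, after rounding the base
        // pointer up to the next 128-byte boundary, workspace_size bytes still fit.
        std::unique_ptr<std::uint8_t[]> storage(new std::uint8_t[workspace_size + alignment]);
        void *aligned = align_up(storage.get(), alignment);

        assert(reinterpret_cast<std::uintptr_t>(aligned) % alignment == 0);
        assert(static_cast<std::uint8_t *>(aligned) + workspace_size <= storage.get() + workspace_size + alignment);
        return 0;
    }
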
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 92e641e..0232a83 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -32,6 +32,7 @@
#include "support/ToolchainSupport.h"
#include <cmath>
+#include <set>
#include <tuple>
using namespace arm_compute;
@@ -100,6 +101,9 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
+ const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+ gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
+
if(_is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -110,7 +114,7 @@
input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
- _mm_gemmlowp.configure(input, weights, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+ _mm_gemmlowp.configure(input, weights, nullptr, output, gemm_info);
// Restore QuantizationInfo as input and weights could be used in other convolution layers
input->info()->set_quantization_info(input_quantization_info);
@@ -119,8 +123,7 @@
else
{
// Configure matrix multiply function
- _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, gemm_3d_depth,
- _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */));
+ _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
}
}
@@ -128,7 +131,8 @@
{
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col);
+ const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+ gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
if(is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -142,7 +146,7 @@
weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
// Perform validation step on GEMMLowp
- return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), output, gemm_info);
+ return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), nullptr, output, gemm_info);
}
else
{
@@ -185,19 +189,18 @@
const DataLayout data_layout = input->info()->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
const unsigned int kernel_width = weights->info()->dimension(idx_width);
const unsigned int kernel_height = weights->info()->dimension(idx_height);
- _is_prepared = weights_info.retain_internal_weights();
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
- _skip_col2im = data_layout == DataLayout::NHWC;
- _append_bias = (biases != nullptr) && (!_is_quantized);
+ _is_prepared = weights_info.retain_internal_weights();
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _data_layout = data_layout;
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ _append_bias = (biases != nullptr) && (!_is_quantized);
+ _is_activationlayer_enabled = act_info.enabled();
const ITensor *gemm_input_to_use = input;
ITensor *gemm_output_to_use = output;
@@ -214,17 +217,20 @@
dilation);
// Check if GEMM3D is supported
- if(_skip_col2im)
+ if(data_layout == DataLayout::NHWC)
{
+ _skip_col2im = bool(validate_gemm3d(input->info()->data_type(), conv_h, true));
// If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!bool(validate_gemm3d(input->info()->data_type(), conv_h, _skip_im2col)))
+ if(!_skip_col2im)
{
_skip_im2col = false;
- _skip_col2im = false;
}
}
+ else
+ {
+ _skip_col2im = false;
+ }
- const unsigned bias_element = (_append_bias && !_skip_im2col) ? 1 : 0;
const ITensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
// Get parameters from conv_info
@@ -233,7 +239,6 @@
std::tie(stride_x, stride_y) = conv_info.stride();
unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->info()->dimension(idx_width) * weights->info()->dimension(idx_height) * weights->info()->dimension(idx_channel) + bias_element;
// _weights_reshaped will be auto configured in the kernel.
// Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM
@@ -242,14 +247,6 @@
// Create tensor to store im2col reshaped inputs
if(!_skip_im2col)
{
- // Calculate im2col shape
- // For NEON the batch size is on the fourth dimension
- TensorShape shape_im2col = input->info()->tensor_shape();
- shape_im2col.set(0, mat_weights_rows);
- shape_im2col.set(1, conv_w * conv_h);
- shape_im2col.set(2, 1);
-
- _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
_memory_group.manage(&_im2col_output);
// Configure
@@ -265,17 +262,27 @@
}
// Create temporary GEMM output tensor in case we cannot skip col2im
- if(!_skip_col2im)
+ if(!_skip_col2im || _is_quantized)
{
- // Calculate GEMM output shape
- TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
-
// GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
+ TensorShape shape_gemm;
+
+ if(_is_quantized && _skip_col2im)
+ {
+ shape_gemm = output->info()->tensor_shape();
+ }
+ else
+ {
+ // Calculate GEMM output shape
+ shape_gemm = _im2col_output.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+ }
+
+ // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
- info_gemm.set_quantization_info(output->info()->quantization_info());
+ info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
_gemm_output.allocator()->init(info_gemm);
_memory_group.manage(&_gemm_output);
@@ -284,7 +291,9 @@
}
// Configure GEMM
- configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, _skip_col2im ? conv_h : 1);
+ // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be used in order to avoid reshaping the output matrix
+ const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
+ configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, gemm_3d_depth);
if(!_skip_im2col)
{
@@ -294,16 +303,39 @@
// Configure output stage for quantized case
if(_is_quantized)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+ const QuantizationInfo input_quant_info = input->info()->quantization_info();
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input_quant_info : output->info()->quantization_info();
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ float multiplier = input_quant_info.scale * weights->info()->quantization_info().scale / output_quant_info.scale;
int output_multiplier, output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _memory_group.manage(&_tmp_output);
- gemm_output_staged_to_use = &_tmp_output;
+ if(!_skip_col2im)
+ {
+ _memory_group.manage(&_tmp_output);
+ gemm_output_staged_to_use = &_tmp_output;
+ }
- _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset);
+ // Merge activation with output stage
+ int min_activation = 0;
+ int max_activation = 0;
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+ if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+ max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+ _is_activationlayer_enabled = false;
+ }
+
+ _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset, min_activation, max_activation);
}
if(!_skip_col2im)
@@ -320,12 +352,12 @@
}
}
- if(_is_quantized)
+ if(_is_quantized && !_skip_col2im)
{
_tmp_output.allocator()->allocate();
}
- if(!_skip_col2im)
+ if(!_skip_col2im || _is_quantized)
{
_gemm_output.allocator()->allocate();
}
@@ -333,9 +365,7 @@
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
"Output shape does not match the expected one");
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
-
+ // Configure Activation Layer
if(_is_activationlayer_enabled)
{
_activationlayer_function.configure(output, nullptr, act_info);
@@ -370,10 +400,10 @@
const ITensorInfo *gemm_output_staged_to_use = output;
const ITensorInfo *weights_to_use = weights;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool append_bias = (biases != nullptr) && (!is_quantized);
- bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
- bool skip_col2im = data_layout == DataLayout::NHWC;
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool append_bias = (biases != nullptr) && (!is_quantized);
+ bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ bool is_activation_enabled = act_info.enabled();
// Get convolved dimensions
unsigned int conv_w = 0;
@@ -387,6 +417,17 @@
dilation);
// Check if GEMM3D is supported
+ bool skip_col2im = false;
+ if(data_layout == DataLayout::NHWC)
+ {
+ skip_col2im = bool(validate_gemm3d(input->data_type(), conv_h, true));
+ // If not supported, we need to perform im2col and col2im (or reshape layer)
+ if(!skip_col2im)
+ {
+ skip_im2col = false;
+ }
+ }
+
if(skip_col2im)
{
// If not supported, we need to perform im2col and col2im (or reshape layer)
@@ -435,6 +476,7 @@
{
// Create tensor info for im2col reshaped inputs
// For NEON the batch size is on the fourth dimension
+ // TODO (giaiod01): Auto-initialize the output shape of im2col COMPMID-1482
TensorShape shape_im2col = input->tensor_shape();
shape_im2col.set(0, mat_weights_rows);
shape_im2col.set(1, conv_w * conv_h);
@@ -453,33 +495,60 @@
}
// Create temporary GEMM output tensor in case we cannot skip col2im
+ const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
if(!skip_col2im)
{
TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, conv_w * conv_h);
- const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
- // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
- info_gemm.set_quantization_info(output->quantization_info());
-
- gemm_output_to_use = &info_gemm;
}
+ else
+ {
+ info_gemm = TensorInfo(output->tensor_shape(), 1, gemm_data_type);
+ }
+ info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
+ gemm_output_to_use = &info_gemm;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 1, skip_im2col));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 0, skip_im2col));
if(is_quantized)
{
- float multiplier = input->quantization_info().scale * weights_to_use->quantization_info().scale / output->quantization_info().scale;
- int output_multiplier, output_shift;
+ const QuantizationInfo input_quant_info = input->quantization_info();
+ const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input_quant_info : output->quantization_info();
+ const float multiplier = input_quant_info.scale * weights_to_use->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier, output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
- tmp_info.set_quantization_info(output->quantization_info());
- gemm_output_staged_to_use = &tmp_info;
+ if(!skip_col2im)
+ {
+ tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
+ tmp_info.set_quantization_info(output->quantization_info()).set_data_layout(data_layout);
+ gemm_output_staged_to_use = &tmp_info;
+ }
+
+ // Merge activation with output stage
+ int min_activation = 0;
+ int max_activation = 0;
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+
+ if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+ max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+ is_activation_enabled = false;
+ }
// Validate output stage for quantized case
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, output->quantization_info().offset);
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, min_activation, max_activation);
}
// Validate Col2Im/ReshapeLayer
@@ -491,7 +560,7 @@
}
//Validate Activation Layer
- if(act_info.enabled())
+ if(is_activation_enabled)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
}
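
In the quantized path the activation is now merged into the GEMMLowp output stage by turning the activation bounds into clamp values on the requantized result: RELU clamps to [offset, 255], BOUNDED_RELU to [offset, quantize(a)] and LU_BOUNDED_RELU to [quantize(b), quantize(a)]. A standalone sketch of how those min/max values follow from the output QuantizationInfo (the quantize() helper restates the usual QASYMM8 rule and is illustrative only):

    // Derivation of the merged-activation clamp bounds (illustrative only).
    #include <algorithm>
    #include <cmath>
    #include <iostream>

    enum class Act { RELU, BOUNDED_RELU, LU_BOUNDED_RELU };

    int quantize(float x, float scale, int offset)
    {
        const int q = static_cast<int>(std::lround(x / scale)) + offset;
        return std::min(255, std::max(0, q));
    }

    // Mirrors the min/max selection in the convolution layer above.
    void activation_bounds(Act act, float a, float b, float scale, int offset, int &min_q, int &max_q)
    {
        const int a_q = quantize(a, scale, offset);
        const int b_q = quantize(b, scale, offset);
        min_q = (act != Act::LU_BOUNDED_RELU) ? offset : b_q;
        max_q = (act == Act::RELU) ? 255 : a_q;
    }

    int main()
    {
        int lo = 0, hi = 0;
        activation_bounds(Act::BOUNDED_RELU, 6.f, 0.f, 0.05f, 10, lo, hi); // e.g. ReLU6
        std::cout << "clamp requantized accumulators to [" << lo << ", " << hi << "]\n"; // [10, 130]
        return 0;
    }
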
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 828011d..4b02694 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -47,10 +47,11 @@
{
}
-void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)
+void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+ ARM_COMPUTE_UNUSED(c);
+ ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
// Clear state
_mtx_a_reshape_kernel = nullptr;
@@ -181,49 +182,76 @@
}
}
-Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
"The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
- "The output matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
- "The output matrix must have the same number of columns as the matrix B");
- ARM_COMPUTE_UNUSED(gemm_info);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");
- int32_t a_offset = a->quantization_info().offset;
- int32_t b_offset = b->quantization_info().offset;
- bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+ int32_t a_offset = a->quantization_info().offset;
+ int32_t b_offset = b->quantization_info().offset;
+ const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- if(!run_vector_matrix_multiplication)
+ // Check if we need to run the optimized assembly kernel
+ const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, output, 1.f, 0.f, reshape_b_only_on_first_run));
+
+ if(run_optimised)
{
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- TensorInfo info_a(shape_tmp_a, 1, a->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->data_type());
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+ if(gemm_info.depth_output_gemm3d() != 0)
+ {
+ if(gemm_info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ }
+ }
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");
+
+ const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+ if(!run_vector_matrix_multiplication)
+ {
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+ }
}
TensorInfo info_vector_sum_col, info_vector_sum_row;
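
When the assembly kernels cannot be used, the fallback path still validates an interleaved copy of A and a transpose1xW copy of B before the lowp matrix multiply. A self-contained sketch of the temporary shapes it checks, using plain integers in place of TensorShape:

    // Temporary shapes validated by the non-assembly GEMMLowp path (illustrative only).
    #include <cassert>
    #include <cmath>
    #include <utility>

    // Shape of the interleaved copy of A: [ dim0 * 4, ceil(dim1 / 4) ]
    std::pair<int, int> interleave4x4_shape(int a_dim0, int a_dim1)
    {
        return { a_dim0 * 4, static_cast<int>(std::ceil(a_dim1 / 4.f)) };
    }

    // Shape of the transpose1xW copy of B: [ dim1 * 16, ceil(dim0 / 16) ]
    std::pair<int, int> transpose1xw_shape(int b_dim0, int b_dim1)
    {
        return { b_dim1 * 16, static_cast<int>(std::ceil(b_dim0 / 16.f)) };
    }

    int main()
    {
        const auto tmp_a = interleave4x4_shape(64, 50); // A is 64 wide, 50 high
        const auto tmp_b = transpose1xw_shape(32, 64);  // B is 32 wide, 64 high

        assert(tmp_a.first == 256 && tmp_a.second == 13);
        assert(tmp_b.first == 1024 && tmp_b.second == 2);
        return 0;
    }
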
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 8c02436..ce69fa0 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index 25e28d2..db5e926 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,7 @@
void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist,
float sensitivity, int32_t gradient_size, int32_t block_size, KeyPointArray *corners,
- BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
+ BorderMode border_mode, uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -126,62 +126,31 @@
// Manage intermediate buffers
_memory_group.manage(&_score);
- if(use_fp16)
+ // Set/init Harris Score kernel according to block_size
+ switch(block_size)
{
- switch(block_size)
+ case 3:
{
- case 3:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 5:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 7:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- default:
- break;
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
}
- }
- else
- {
- // Set/init Harris Score kernel accordingly with block_size
- switch(block_size)
+ break;
+ case 5:
{
- case 3:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 5:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- break;
- case 7:
- {
- auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
- k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- _harris_score = std::move(k);
- }
- default:
- break;
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
}
+ break;
+ case 7:
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ default:
+ break;
}
// Configure border filling before harris score
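
With the FP16 score kernels removed, configure() no longer takes a trailing use_fp16 flag. A minimal usage sketch of the updated signature, assuming the usual Tensor/KeyPointArray helpers and the 18.11 include layout; the sizes and thresholds below are illustrative only:

    // Minimal usage sketch of the updated NEHarrisCorners interface (assumed 18.11 headers).
    #include "arm_compute/runtime/Array.h"
    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src;
        src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

        KeyPointArray corners(1000); // room for up to 1000 detected corners

        NEHarrisCorners harris;
        harris.configure(&src, 1e-5f /* threshold */, 5.f /* min_dist */, 0.04f /* sensitivity */,
                         3 /* gradient_size */, 3 /* block_size */, &corners,
                         BorderMode::UNDEFINED, 0 /* constant_border_value */);

        src.allocator()->allocate();
        // Fill src with U8 image data here, then:
        harris.run();
        return 0;
    }
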
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index 4245b65..9102fca 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -34,18 +34,17 @@
{
}
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups,
- bool is_fully_connected, bool is_flatten)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups)
{
_y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
+ _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
}
Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups, bool is_fully_connected, bool is_flatten)
+ unsigned int num_groups)
{
- return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
+ return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
}
void NEIm2Col::run()
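
NEIm2Col now exposes only the convolution-oriented parameters; the is_fully_connected/is_flatten flags are gone and those cases go through the flatten path shown earlier. A standalone sketch of the im2col output shape a caller still prepares for the plain convolution case, matching the shape_im2col computation kept in NEGEMMConvolutionLayer (plain integers stand in for TensorShape):

    // Im2col output shape for the plain convolution case (illustrative only).
    #include <cassert>

    struct Im2ColShape
    {
        int rows;    // kernel_w * kernel_h * channels (+1 when the bias is appended)
        int cols;    // conv_w * conv_h
        int batches; // unchanged
    };

    Im2ColShape im2col_output_shape(int kernel_w, int kernel_h, int channels,
                                    int conv_w, int conv_h, int batches, bool append_bias)
    {
        return { kernel_w * kernel_h * channels + (append_bias ? 1 : 0), conv_w * conv_h, batches };
    }

    int main()
    {
        // 3x3 convolution over a 16-channel 32x32 input with unit stride and padding 1.
        const Im2ColShape s = im2col_output_shape(3, 3, 16, 32, 32, 4, false);
        assert(s.rows == 144 && s.cols == 1024 && s.batches == 4);
        return 0;
    }
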
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
new file mode 100644
index 0000000..7c7580a
--- /dev/null
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -0,0 +1,545 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _gemm_input_gate(), _transpose_input_gate(), _accum_input_gate1(), _accum_input_gate2(), _subtract_input_gate(),
+ _pixelwise_mul_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _gemm_forget_gate(), _transpose_forget_gate(), _accum_forget_gate1(), _accum_forget_gate2(),
+ _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
+ _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
+ _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
+ _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
+ _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
+ _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
+ _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
+{
+}
+
+void NELSTMLayer::configure(const ITensor *input,
+ const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
+ const ITensor *output_state_in, const ITensor *cell_state_in,
+ ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
+ const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input,
+ input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in,
+ scratch_buffer, output_state_out, cell_state_out, output);
+
+ // Set lstm parameters
+ LSTMParams<ITensorInfo> lstm_params_info;
+ if(lstm_params.has_peephole_opt())
+ {
+ lstm_params_info.set_peephole_params(lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
+ }
+ if(lstm_params.has_projection())
+ {
+ lstm_params_info.set_projection_params(lstm_params.projection_weights()->info(),
+ lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
+ }
+ if(!lstm_params.has_cifg_opt())
+ {
+ const ITensorInfo *cell_to_input_weights_info = (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
+ lstm_params_info.set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
+ cell_to_input_weights_info, lstm_params.input_gate_bias()->info());
+ }
+
+ // Validate
+ ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(),
+ input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ output_state_in->info(), cell_state_in->info(),
+ scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+ lstm_params_info, activation_info, cell_threshold, projection_threshold));
+
+ const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
+
+ // Configure block that calculates the forget gate
+ // forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
+ TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
+ _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ _memory_group.manage(&_forget_gate_out1);
+ _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
+ _memory_group.manage(&_forget_gate_out2);
+ _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
+ _memory_group.manage(&_forget_gate_out3);
+ _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
+ _forget_gate_out2.allocator()->allocate();
+ _memory_group.manage(&_forget_gate_out5);
+ _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+ Tensor *forget_gate_out = &_forget_gate_out5;
+
+ if(lstm_params.has_peephole_opt())
+ {
+ _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ _run_peephole_opt = true;
+ _memory_group.manage(&_forget_gate_out4);
+ _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_forget_gate2.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _forget_gate_out4.allocator()->allocate();
+ _forget_gate_out5.allocator()->allocate();
+ forget_gate_out = &_forget_gate_out3;
+ }
+ else
+ {
+ _forget_gate_out3.allocator()->allocate();
+ }
+ _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ forget_gate_out->allocator()->allocate();
+
+ // Configure block that calculates the input gate
+ // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
+ // input_gate = 1 - forget_gate, with CIFG
+ _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ if(lstm_params.has_cifg_opt())
+ {
+ _memory_group.manage(&_input_gate_out1);
+ _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _ones.allocator()->allocate();
+ _run_cifg_opt = true;
+ }
+ else
+ {
+ TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+
+ _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
+ _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ _memory_group.manage(&_input_gate_out1);
+ _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
+ _memory_group.manage(&_input_gate_out2);
+ _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+ _memory_group.manage(&_input_gate_out3);
+ _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+ _input_gate_out2.allocator()->allocate();
+ _memory_group.manage(&_input_gate_out4);
+ _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+ if(_run_peephole_opt)
+ {
+ _memory_group.manage(&_input_gate_out5);
+ _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out5.allocator()->allocate();
+ }
+ _input_gate_out3.allocator()->allocate();
+ _input_gate_out4.allocator()->allocate();
+ _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ }
+
+ // Configure block that calculates the cell state
+ // cell_state = Clip((PixelwiseMul(input_gate, Activation(input * input_to_cell_weights + output_state_in * recurrent_to_cell_weights + cell_bias)) + PixelwiseMul(forget_gate, cell_state)), cell_threshold)
+ TensorShape cell_state1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ _cell_state_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _cell_state_out2.allocator()->init(TensorInfo(cell_state1_shape, 1, input->info()->data_type()));
+ _cell_state_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _cell_state_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ _memory_group.manage(&_cell_state_out1);
+ _fully_connected_cell_state.configure(input, input_to_cell_weights, cell_bias, &_cell_state_out1);
+ _memory_group.manage(&_cell_state_out2);
+ _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
+ _memory_group.manage(&_cell_state_out3);
+ _gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+ _cell_state_out2.allocator()->allocate();
+ _memory_group.manage(&_cell_state_out4);
+ _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+ _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
+ _memory_group.manage(&_cell_state_out5);
+ _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _input_gate_out1.allocator()->allocate();
+ _cell_state_out4.allocator()->allocate();
+ _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _forget_gate_out1.allocator()->allocate();
+ _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+ _cell_state_out3.allocator()->allocate();
+ _cell_state_out5.allocator()->allocate();
+ // Perform clipping
+ if(cell_threshold != 0.f)
+ {
+ _perform_cell_clipping = true;
+ _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+ }
+
+ // Configure block that calculates the output
+ // output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
+ TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
+ _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ _memory_group.manage(&_output1);
+ _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
+ _memory_group.manage(&_output2);
+ _transpose_output.configure(recurrent_to_output_weights, &_output2);
+ _memory_group.manage(&_output3);
+ _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
+ _output2.allocator()->allocate();
+ _memory_group.manage(&_output5);
+ _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
+ _output3.allocator()->allocate();
+ Tensor *output_gate_out = &_output5;
+ if(lstm_params.has_peephole_opt())
+ {
+ _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
+
+ _memory_group.manage(&_output4);
+ _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
+ _output5.allocator()->allocate();
+ output_gate_out = &_output1;
+
+ // Allocate intermediate buffers
+ _output4.allocator()->allocate();
+ }
+ else
+ {
+ _output1.allocator()->allocate();
+ }
+ _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ output_gate_out->allocator()->allocate();
+
+ // Configure block that calculates the output state
+ /** lstm_res = PixelwiseMul(output, Activation(cell_state))
+ *
+ * -- Clip(lstm_res * projection_weights + projection_bias, projection_threshold) , if there is a projection
+ * /
+ * output_state = --
+ * \
+ * -- lstm_res , otherwise
+ */
+ ITensor *output_state_out_tmp = lstm_params.has_projection() ? &_output_state1 : output_state_out;
+ _cell_state_activation.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output_state1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ _memory_group.manage(&_cell_state_activation);
+ _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
+ _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state_activation.allocator()->allocate();
+
+ if(lstm_params.has_projection())
+ {
+ _has_projection_weights = true;
+ _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+ _output_state1.allocator()->allocate();
+ // Perform clipping
+ if(projection_threshold != 0.f)
+ {
+ _perform_projection_clipping = true;
+ _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+ }
+ }
+
+ // Copy cell state and output
+ _copy_cell_state.configure(&_cell_state_out1, cell_state_out);
+ _cell_state_out1.allocator()->allocate();
+ _copy_output.configure(output_state_out, output);
+
+ // Vector for holding the tensors to store in scratch buffer
+ std::vector<ITensor *> scratch_inputs;
+ if(!lstm_params.has_cifg_opt())
+ {
+ scratch_inputs.emplace_back(&_input_gate_out1);
+ }
+ scratch_inputs.emplace_back(&_cell_state_out1);
+ scratch_inputs.emplace_back(forget_gate_out);
+ scratch_inputs.emplace_back(output_gate_out);
+ _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+}
+
+Status NELSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
+ input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in,
+ scratch_buffer, output_state_out, cell_state_out, output);
+
+ // Check data types
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
+ input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in,
+ scratch_buffer, output_state_out, cell_state_out, output);
+
+ // Check dimensions
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_forget_weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_cell_weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_forget_weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_cell_weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_gate_bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(scratch_buffer->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
+ && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+
+ const unsigned int num_batches = input->dimension(1);
+ const unsigned int num_cells = input_to_output_weights->dimension(1);
+
+ // Check peephole optimization
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_output_weights()->num_dimensions() > 1);
+ }
+
+ TensorShape units_out_transposed_shape = compute_transposed_shape(*recurrent_to_output_weights);
+ TensorShape num_units_transposed_shape = compute_transposed_shape(*forget_gate_bias);
+ const TensorInfo units_out_transposed_info = TensorInfo(units_out_transposed_shape, 1, input->data_type());
+ const TensorInfo num_units_transposed_info = TensorInfo(num_units_transposed_shape, 1, input->data_type());
+
+ TensorInfo input_gate = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+ TensorInfo forget_gate = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+ TensorInfo output_gate_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+ TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+
+ // Validate forget gate
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+ // Validate input gate
+ if(!lstm_params.has_cifg_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights(),
+ lstm_params.input_gate_bias());
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ }
+
+ // Validate cell state
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, cell_bias, &cell_state_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if(cell_threshold != 0.f)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold,
+ cell_threshold)));
+ }
+
+ // Validate output gate tmp
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+ // Validate output state
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ if(lstm_params.has_projection())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
+ if(projection_threshold != 0.f)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(output_state_out, output_state_out,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+ }
+ }
+
+ // Validate copy kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(&cell_state_tmp, cell_state_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output));
+
+ // Validate scratch concatenation
+ std::vector<ITensorInfo *> inputs_vector_info_raw;
+ if(!lstm_params.has_cifg_opt())
+ {
+ inputs_vector_info_raw.push_back(&input_gate);
+ }
+ inputs_vector_info_raw.push_back(&cell_state_tmp);
+ inputs_vector_info_raw.push_back(&forget_gate);
+ inputs_vector_info_raw.push_back(&output_gate_tmp);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
+ return Status{};
+}
+
+void NELSTMLayer::run()
+{
+ _memory_group.acquire();
+
+ _fully_connected_forget_gate.run();
+ NEScheduler::get().schedule(&_transpose_forget_gate, Window::DimY);
+ _gemm_forget_gate.run();
+ NEScheduler::get().schedule(&_accum_forget_gate1, Window::DimY);
+
+ if(_run_peephole_opt)
+ {
+ NEScheduler::get().schedule(&_pixelwise_mul_forget_gate, Window::DimY);
+ _accum_forget_gate2.run();
+ }
+ NEScheduler::get().schedule(&_activation_forget_gate, Window::DimY);
+
+ if(_run_cifg_opt)
+ {
+ if(_ones.info()->data_type() == DataType::F16)
+ {
+ std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+ }
+ else
+ {
+ std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+ }
+ NEScheduler::get().schedule(&_subtract_input_gate, Window::DimY);
+ }
+ else
+ {
+ _fully_connected_input_gate.run();
+ NEScheduler::get().schedule(&_transpose_input_gate, Window::DimY);
+ _gemm_input_gate.run();
+ NEScheduler::get().schedule(&_accum_input_gate1, Window::DimY);
+ if(_run_peephole_opt)
+ {
+ NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY);
+ _accum_input_gate2.run();
+ }
+ NEScheduler::get().schedule(&_activation_input_gate, Window::DimY);
+ }
+
+ _fully_connected_cell_state.run();
+ NEScheduler::get().schedule(&_transpose_cell_state, Window::DimY);
+ _gemm_cell_state1.run();
+ NEScheduler::get().schedule(&_accum_cell_state1, Window::DimY);
+ NEScheduler::get().schedule(&_activation_cell_state, Window::DimY);
+ NEScheduler::get().schedule(&_pixelwise_mul_cell_state1, Window::DimY);
+ NEScheduler::get().schedule(&_pixelwise_mul_cell_state2, Window::DimY);
+ NEScheduler::get().schedule(&_accum_cell_state2, Window::DimY);
+
+ if(_perform_cell_clipping)
+ {
+ NEScheduler::get().schedule(&_cell_clip, Window::DimY);
+ }
+
+ _fully_connected_output.run();
+ NEScheduler::get().schedule(&_transpose_output, Window::DimY);
+ _gemm_output.run();
+ NEScheduler::get().schedule(&_accum_output1, Window::DimY);
+
+ if(_run_peephole_opt)
+ {
+ NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY);
+ _accum_output2.run();
+ }
+ NEScheduler::get().schedule(&_activation_output, Window::DimY);
+
+ NEScheduler::get().schedule(&_activation_output_state, Window::DimY);
+ NEScheduler::get().schedule(&_pixelwise_mul_output_state2, Window::DimY);
+
+ if(_has_projection_weights)
+ {
+ _fully_connected_output_state.run();
+ if(_perform_projection_clipping)
+ {
+ NEScheduler::get().schedule(&_projection_clip, Window::DimY);
+ }
+ }
+
+ NEScheduler::get().schedule(&_copy_cell_state, Window::DimY);
+ NEScheduler::get().schedule(&_copy_output, Window::DimY);
+
+ _concat_scratch_buffer.run();
+
+ _memory_group.release();
+}
\ No newline at end of file
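For orientation, a minimal scalar sketch of the recurrence that the kernel schedule in run() above evaluates tensor-wise. It assumes the usual logistic/tanh gate activations; none of the names below are library symbols, and the CIFG, peephole, clipping and projection variants are only noted in comments.

#include <cmath>

namespace
{
// Pre-activation gate values: each is W_x * x_t + W_h * h_{t-1} + bias, which the
// fully-connected + GEMM + accumulate steps above compute per gate.
struct LstmPreGates
{
    float forget, input, cell, output;
};

// One cell step: returns the new cell state c_t and writes the new output state h_t.
float lstm_cell_step(const LstmPreGates &pre, float c_prev, float &h_out)
{
    const auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };

    const float f = sigmoid(pre.forget); // _activation_forget_gate
    const float i = sigmoid(pre.input);  // _activation_input_gate (or 1 - f when CIFG is enabled)
    const float g = std::tanh(pre.cell); // _activation_cell_state
    const float o = sigmoid(pre.output); // _activation_output

    const float c = f * c_prev + i * g;  // _pixelwise_mul_cell_state1/2 + _accum_cell_state2, optionally clipped by _cell_clip
    h_out         = o * std::tanh(c);    // _activation_output_state + _pixelwise_mul_output_state2, optionally projected
    return c;
}
} // namespace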
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
new file mode 100644
index 0000000..6e7d4ab
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEPriorBoxLayerKernel>();
+ k->configure(input1, input2, output, info);
+ _kernel = std::move(k);
+}
+
+Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+ return NEPriorBoxLayerKernel::validate(input1, input2, output, info);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
new file mode 100644
index 0000000..0b022df
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
+ {
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+ }
+
+ return Status{};
+}
+
+void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels = arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
+ _reduced_outs = arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
+
+ // Perform reduction for every axis
+ for(unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+ out_shape.set(reduction_axis[i], 1);
+ auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+
+ if(i == _reduction_ops - 1 && keep_dims)
+ {
+ _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ }
+ else
+ {
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type()));
+ _memory_group.manage(_reduced_outs.get() + i);
+ _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ }
+ }
+
+ // Allocate intermediate tensors
+ for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ {
+ _reduced_outs[i].allocator()->allocate();
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if(!keep_dims)
+ {
+ TensorShape out_shape = input->info()->tensor_shape();
+ for(unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(reduction_axis[i]);
+ }
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+ }
+}
+
+void NEReduceMean::run()
+{
+ _memory_group.acquire();
+
+ for(unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ _reduction_kernels[i].run();
+ }
+
+ if(!_keep_dims)
+ {
+ _reshape.run();
+ }
+ _memory_group.release();
+}
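A minimal usage sketch for the new function (shapes are illustrative, tensor filling and error handling are omitted). With keep_dims set to false, configure() auto-initialises the output through the reshape path shown above.

#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/runtime/Tensor.h"

void reduce_mean_example()
{
    using namespace arm_compute;

    Tensor src{};
    Tensor dst{};
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));

    NEReduceMean reduce_mean(nullptr); // no external memory manager
    // Mean over the channel axis (index 2), dropping it from the output shape.
    reduce_mean.configure(&src, Coordinates(2), /* keep_dims */ false, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...
    reduce_mean.run();
}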
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index cd0b42f..188c2bb 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -26,8 +26,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
/** Define dimension to split the window
@@ -42,6 +42,10 @@
{
case 0:
return Window::DimY;
+ case 1:
+ case 2:
+ case 3:
+ return Window::DimX;
default:
ARM_COMPUTE_ERROR("Unsupported reduction axis");
}
@@ -59,7 +63,7 @@
} // namespace
NEReductionOperation::NEReductionOperation()
- : _reduction_kernel(), _fill_border_kernel(), _window_split(0)
+ : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
{
}
@@ -72,20 +76,28 @@
void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
// Configure reduction kernel
_reduction_kernel.configure(input, output, axis, op);
- _window_split = reduction_window_split_dimension(axis);
+ _window_split = reduction_window_split_dimension(axis);
+ _reduction_axis = axis;
- // Configure fill border kernel
- BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
- BorderMode fill_border_mode = reduction_operation_border_mode(op);
- _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(static_cast<float>(0.f)));
+ if(axis == 0)
+ {
+ // Configure fill border kernel
+ BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
+ BorderMode fill_border_mode = reduction_operation_border_mode(op);
+ _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(static_cast<float>(0.f)));
+ }
}
void NEReductionOperation::run()
{
- NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ if(_reduction_axis == 0)
+ {
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ }
NEScheduler::get().schedule(&_reduction_kernel, _window_split);
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp
new file mode 100644
index 0000000..4ad032b
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReorgLayer.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEReorgLayer::configure(const ITensor *input, ITensor *output, int32_t stride)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEReorgLayerKernel>();
+ k->configure(input, output, stride);
+ _kernel = std::move(k);
+}
+
+Status NEReorgLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+ return NEReorgLayerKernel::validate(input, output, stride);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index fef4e0c..4600f36 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
+#include "arm_compute/core/Validate.h"
#include "support/ToolchainSupport.h"
#include <utility>
@@ -36,3 +37,11 @@
k->configure(input, output);
_kernel = std::move(k);
}
+
+Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, output));
+
+ return Status{};
+}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 3a73f1e..9be9e68 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -36,9 +36,10 @@
{
}
-void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta)
+void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(axis);
// Configure Kernels
_max_kernel.configure(input, &_max);
@@ -58,8 +59,10 @@
_tmp.allocator()->allocate();
}
-Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
+Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis must be 1 for NEON");
+
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
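Since the new axis argument is unused in configure() and rejected by validate() unless it is 1, a caller can probe support ahead of time. A small sketch (the helper name is illustrative, not part of the patch):

#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"

#include <cstddef>

bool neon_softmax_axis_supported(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, std::size_t axis)
{
    // Only axis == 1 passes on NEON; other values fail with "Axis must be 1 for NEON".
    return arm_compute::NESoftmaxLayer::validate(input, output, 1.f, axis).error_code() == arm_compute::ErrorCode::OK;
}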
diff --git a/src/runtime/NEON/functions/NEUpsampleLayer.cpp b/src/runtime/NEON/functions/NEUpsampleLayer.cpp
new file mode 100644
index 0000000..9be96af
--- /dev/null
+++ b/src/runtime/NEON/functions/NEUpsampleLayer.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
+
+namespace arm_compute
+{
+NEUpsampleLayer::NEUpsampleLayer()
+ : _kernel(), _data_layout()
+{
+}
+
+Status NEUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info,
+ const InterpolationPolicy &policy)
+{
+ return NEUpsampleLayerKernel::validate(input, output, info, policy);
+}
+
+void NEUpsampleLayer::configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy &policy)
+{
+ _data_layout = input->info()->data_layout();
+ _kernel.configure(input, output, info, policy);
+}
+
+void NEUpsampleLayer::run()
+{
+ const auto win = (_data_layout == DataLayout::NCHW) ? Window::DimZ : Window::DimX;
+ NEScheduler::get().schedule(&_kernel, win);
+}
+} // namespace arm_compute
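A minimal usage sketch for the new function (illustrative NCHW shape, tensor filling omitted). As run() above shows, NCHW inputs are scheduled over DimZ and NHWC inputs over DimX.

#include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h"
#include "arm_compute/runtime/Tensor.h"

void upsample_example()
{
    using namespace arm_compute;

    Tensor src{};
    Tensor dst{};
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32)); // W x H x C (NCHW)

    NEUpsampleLayer upsample{};
    upsample.configure(&src, &dst, Size2D(2, 2), InterpolationPolicy::NEAREST_NEIGHBOR);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...
    upsample.run();
}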
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 828a593..c8e3b3b 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -39,6 +39,121 @@
{
namespace
{
+inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ if(input_dims.width > 4 && input_dims.height > 4)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
+ }
+
+ if(act_info.enabled())
+ {
+ NEActivationLayer::validate(output, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_5x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, output, winograd_info)));
+ if(act_info.enabled())
+ {
+ NEActivationLayer::validate(output, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_3x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>::validate(input, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, output, winograd_info)));
+ if(act_info.enabled())
+ {
+ NEActivationLayer::validate(output, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_1x3(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>::validate(input, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, output, winograd_info)));
+
+ if(act_info.enabled())
+ {
+ NEActivationLayer::validate(output, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_5x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>::validate(input, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, output, winograd_info)));
+ if(act_info.enabled())
+ {
+ NEActivationLayer::validate(output, nullptr, act_info);
+ }
+ return Status{};
+}
+inline Status validate_kernel_1x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>::validate(input, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, output, winograd_info)));
+ if(act_info.enabled())
+ {
+ NEActivationLayer::validate(output, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_7x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>::validate(input, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, output, winograd_info)));
+ if(act_info.enabled())
+ {
+ NEActivationLayer::validate(output, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_1x7(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>::validate(input, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, output, winograd_info)));
+
+ if(act_info.enabled())
+ {
+ NEActivationLayer::validate(output, nullptr, act_info);
+ }
+ return Status{};
+}
+
inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
{
const DataLayout data_layout = input->info()->data_layout();
@@ -52,31 +167,19 @@
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
- const DataLayout data_layout = input->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(height_idx) != 5, "Only 3 and 5 kernels are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
-
if(biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
}
-
- return Status{};
+ return INEWinogradLayerTransformWeightsKernel<float>::validate(input, weights);
}
Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
{
Size2D output_tile = Size2D{};
-
if(kernel_dims == Size2D(3U, 3U))
{
output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
@@ -85,7 +188,30 @@
{
output_tile = Size2D(2U, 2U);
}
-
+ else if(kernel_dims == Size2D(1U, 3U))
+ {
+ output_tile = Size2D(1U, 6U);
+ }
+ else if(kernel_dims == Size2D(3U, 1U))
+ {
+ output_tile = Size2D(6U, 1U);
+ }
+ else if(kernel_dims == Size2D(1U, 5U))
+ {
+ output_tile = Size2D(1U, 4U);
+ }
+ else if(kernel_dims == Size2D(5U, 1U))
+ {
+ output_tile = Size2D(4U, 1U);
+ }
+ else if(kernel_dims == Size2D(7U, 1U))
+ {
+ output_tile = Size2D(2U, 1U);
+ }
+ else if(kernel_dims == Size2D(1U, 7U))
+ {
+ output_tile = Size2D(1U, 2U);
+ }
return output_tile;
}
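For reference, the relation encoded by the new branches above, sketched outside the patch: each dimension of the Winograd transform tile is output_tile + kernel - 1, and the batched matrix multiply then runs one GEMM per transform-tile element.

constexpr int winograd_transform_tile(int output_tile, int kernel)
{
    return output_tile + kernel - 1;
}

static_assert(winograd_transform_tile(4, 3) == 6, "F(4x4, 3x3): 6x6 tiles -> 36 GEMMs");
static_assert(winograd_transform_tile(2, 3) == 4, "F(2x2, 3x3): 4x4 tiles -> 16 GEMMs");
static_assert(winograd_transform_tile(2, 5) == 6, "F(2x2, 5x5): 6x6 tiles -> 36 GEMMs");
static_assert(winograd_transform_tile(6, 3) == 8, "F(6x1, 3x1): 8x1 tiles -> 8 GEMMs");
static_assert(winograd_transform_tile(2, 7) == 8, "F(2x1, 7x1): 8x1 tiles -> 8 GEMMs");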
@@ -94,7 +220,7 @@
// Check if we want to configure a Winograd configuration which requires fast math
using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
- std::vector<WinogradConfiguration> fast_math_winograd =
+ const std::vector<WinogradConfiguration> fast_math_winograd =
{
WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
@@ -109,7 +235,7 @@
} //namespace
NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _asm_glue(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
+ : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
_permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
_is_prepared(false), _is_activationlayer_enabled(false)
{
@@ -149,48 +275,96 @@
int n_gemms = 0;
int N_BLOCK = 0; // Size of block used by GEMM.
- switch(kernel_size.width)
+ if(kernel_size == Size2D(3, 3))
{
- case 3:
+ if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
{
- if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
- {
- using config = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>;
- transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else
- {
- using config = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
- transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- break;
- }
- case 5:
- {
- using config = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>;
+ using config = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>;
transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
n_gemms = config::WinogradBase::N_GEMMS;
N_BLOCK = config::WinogradConv::N_BLOCK;
- break;
}
- default:
+ else
{
- ARM_COMPUTE_ERROR("Not supported.");
- break;
+ using config = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
}
}
+ else if(kernel_size == Size2D(5, 5))
+ {
+ using config = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(1, 3))
+ {
+ using config = NEWinogradLayerConfiguration<float, float, 6, 1, 3, 1>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(3, 1))
+ {
+ using config = NEWinogradLayerConfiguration<float, float, 1, 6, 1, 3>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(1, 5))
+ {
+ using config = NEWinogradLayerConfiguration<float, float, 4, 1, 5, 1>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(5, 1))
+ {
+ using config = NEWinogradLayerConfiguration<float, float, 1, 4, 1, 5>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(1, 7))
+ {
+ using config = NEWinogradLayerConfiguration<float, float, 2, 1, 7, 1>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(7, 1))
+ {
+ using config = NEWinogradLayerConfiguration<float, float, 1, 2, 1, 7>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported.");
+ }
- const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
+ const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID;
const bool use_same_padding = use_padding_type == PADDING_SAME;
// Get convolved dimensions
@@ -207,19 +381,19 @@
const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
in_channels)
* data_type_size
- + storage_alignment - 1;
+ + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
// Input storage
const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
use_same_padding)
* data_type_size
- + storage_alignment - 1;
+ + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
// Output storage
const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
use_same_padding)
* data_type_size
- + storage_alignment - 1;
+ + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
;
const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
@@ -241,6 +415,7 @@
TensorShape a_shape(k, m, 1, n_gemms);
Strides a_strides(data_type_size);
a_strides.set(1, a_strides[0] * k);
+ //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
a_strides.set(2, 0);
a_strides.set(3, data_type_size * input_matrix_stride);
@@ -252,6 +427,7 @@
TensorShape d_shape(n, m, 1, n_gemms);
Strides d_strides(data_type_size);
d_strides.set(1, data_type_size * output_matrix_row_stride);
+ //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
d_strides.set(2, 0);
d_strides.set(3, data_type_size * output_matrix_stride);
@@ -272,6 +448,8 @@
// Configure the InputTransform
_memory_group.manage(&_input_workspace);
+ _memory_group.manage(&_output_workspace);
+
if(data_layout == DataLayout::NCHW)
{
// configure the kernel to transform the input tensor from NCHW -> NHWC
@@ -279,48 +457,34 @@
_input_nhwc.allocator()->allocate();
transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
&_input_workspace, input_matrix_stride);
- }
- else
- {
- transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- &_input_workspace, input_matrix_stride);
- }
- // Configure WeightsTransform
- if(data_layout == DataLayout::NCHW)
- {
// Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
_permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
- }
- else
- {
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
- transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
- }
- _weights_hwio.allocator()->allocate();
-
- // Configure OutputTransform
- //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
-
- _memory_group.manage(&_output_workspace);
- if(data_layout == DataLayout::NCHW)
- {
+ //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
transform_output_kernel->configure(biases, &_output_workspace,
output_matrix_stride, &_output_nhwc,
in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
}
else
{
+ transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+ &_input_workspace, input_matrix_stride);
+
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
+
+ transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+
transform_output_kernel->configure(biases, &_output_workspace,
output_matrix_stride, _output,
in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
}
- _asm_glue.configure(&_input_workspace, &_kernel_storage, &_output_workspace, 1.0f, 0.f, false);
+ _weights_hwio.allocator()->allocate();
+ _gemm_function.configure(&_input_workspace, &_kernel_storage, nullptr, &_output_workspace, 1.0f, 0.f);
_input_workspace.allocator()->allocate();
_kernel_storage.allocator()->allocate();
_output_workspace.allocator()->allocate();
@@ -355,12 +519,12 @@
//Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
_permute_input.run();
}
+
// Transform input tensor to the winograd domain
NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
//Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
- _asm_glue.run();
-
+ _gemm_function.run();
// Transform output tensor to the spatial domain
NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
@@ -408,97 +572,81 @@
// Validate input transform
const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
-
- switch(weights->dimension(idx_width))
- {
- case 3:
- {
- if(input_dims.width > 4 && input_dims.height > 4)
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, &input0, winograd_info)));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, &input0, winograd_info)));
- }
- break;
- }
- case 5:
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, &input0, winograd_info)));
- break;
- }
- default:
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
- break;
- }
- }
// Validate filter transform
const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
-
- switch(weights->dimension(idx_width))
- {
- case 3:
- {
- if(input_dims.width > 4 && input_dims.height > 4)
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, &input1, winograd_info)));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, &input1, winograd_info)));
- }
- break;
- }
- case 5:
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, &input1, winograd_info)));
- break;
- }
- default:
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
- break;
- }
- }
// Validate batched matrix multiply
TensorShape batched_mm_output_shape = input0.tensor_shape();
batched_mm_output_shape[0] = input1.tensor_shape()[0];
const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
- switch(weights->dimension(idx_width))
+
+ if(kernel_size == Size2D(3, 3))
{
- case 3:
- {
- if(input_dims.width > 4 && input_dims.height > 4)
- {
- // Validate output transform
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
- }
- else
- {
- // Validate output transform
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
- }
- break;
- }
- case 5:
- {
- // Validate output transform
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(&batched_mm_output, biases, output, winograd_info)));
- break;
- }
- default:
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
- break;
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+ return validate_kernel_3x3(input_dims, input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
}
- // Validate Activation Layer
- if(act_info.enabled())
+ else if(kernel_size == Size2D(5, 5))
{
- NEActivationLayer::validate(output, nullptr, act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+ return validate_kernel_5x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+ }
+ if(kernel_size == Size2D(3, 1))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_3x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(1, 3))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_1x3(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(5, 1))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_5x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(1, 5))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_1x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(7, 1))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_7x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(1, 7))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_1x7(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported");
}
return Status{};
}
@@ -513,8 +661,8 @@
// Transform weights
NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
- _weights_hwio.allocator()->free();
+ _weights_hwio.allocator()->free();
_is_prepared = true;
}
}
diff --git a/src/runtime/NEON/functions/NEYOLOLayer.cpp b/src/runtime/NEON/functions/NEYOLOLayer.cpp
new file mode 100644
index 0000000..e52d054
--- /dev/null
+++ b/src/runtime/NEON/functions/NEYOLOLayer.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEYOLOLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEYOLOLayer::configure(ITensor *input, ITensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEYOLOLayerKernel>();
+ k->configure(input, output, act_info, num_classes);
+ _kernel = std::move(k);
+}
+
+Status NEYOLOLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+ return NEYOLOLayerKernel::validate(input, output, act_info, num_classes);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index b52ce66..c87e82a 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
@@ -42,7 +43,7 @@
prepare();
_memory_group.acquire();
- NEScheduler::get().run_workloads(_workloads);
+ NEScheduler::get().run_tagged_workloads(_workloads, _tag.c_str());
_memory_group.release();
}
@@ -151,51 +152,59 @@
const unsigned int alignment = 128;
_transformed_b.allocator()->init(TensorInfo{}, alignment);
_tmp_c.allocator()->init(TensorInfo{}, alignment);
+ _tag = "NEGEMMInterleaved_";
+ _tag += get_strategy_name(input_type, use_dot);
+
if(!_pretranspose_b)
{
// If B is transposed at every iteration then transformed_B can be managed:
_memory_group.manage(&_transformed_b);
+ _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
}
- switch(input_type)
+ else
{
- case DataType::F32:
- _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
- break;
+ _tag += "_preB";
+ switch(input_type)
+ {
+ case DataType::F32:
+ _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
+ break;
#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(use_dot)
- {
- _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
- }
- else
- {
- _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
- }
- break;
- case DataType::S8:
- if(use_dot)
- {
- _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
- }
- else
- {
- _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
- }
- break;
+ case DataType::U8:
+ case DataType::QASYMM8:
+ if(use_dot)
+ {
+ _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
+ }
+ else
+ {
+ _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
+ }
+ break;
+ case DataType::S8:
+ if(use_dot)
+ {
+ _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
+ }
+ else
+ {
+ _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
+ }
+ break;
#endif /* __aarch64__ */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
- break;
+ case DataType::F16:
+ _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
+ break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
- ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+ default:
+ ARM_COMPUTE_ERROR("DataType not supported");
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
- _block_sizes = _prepare_b->block_sizes();
+ _block_sizes = _prepare_b->block_sizes();
+ }
_block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
_block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
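
The wrapper now labels its workloads with the selected strategy name and dispatches them through run_tagged_workloads(), so schedulers that understand tags can attribute the work (e.g. for profiling). A minimal sketch of that dispatch path, assuming run_tagged_workloads() is reachable from user code; the workload body and tag string below are placeholders, not the GEMM workload itself.

#include "arm_compute/runtime/NEON/NEScheduler.h"
#include <vector>

using namespace arm_compute;

void run_tagged_example()
{
    std::vector<IScheduler::Workload> workloads;
    workloads.emplace_back([](const ThreadInfo &info)
    {
        // Thread info.thread_id of info.num_threads would process its slice of the work here.
        (void)info;
    });

    // Mirrors NEGEMMInterleavedWrapper::run(): the tag is "NEGEMMInterleaved_" plus the
    // strategy name returned by get_strategy_name(); the literal here is an assumption.
    NEScheduler::get().run_tagged_workloads(workloads, "NEGEMMInterleaved_example");
}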
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index f4253c8..2355389 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -42,7 +42,6 @@
OMPScheduler::OMPScheduler() // NOLINT
: _num_threads(omp_get_max_threads())
{
- get_cpu_configuration(_cpu_info);
}
unsigned int OMPScheduler::num_threads() const
@@ -90,6 +89,7 @@
}
}
+#ifndef DOXYGEN_SKIP_THIS
void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads)
{
const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size()));
@@ -108,3 +108,4 @@
workloads[tid](info);
}
}
+#endif /* DOXYGEN_SKIP_THIS */
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
index 96f54f8..36eaf0b 100644
--- a/src/runtime/OffsetMemoryPool.cpp
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/IAllocator.h"
#include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/MemoryRegion.h"
#include "arm_compute/runtime/Types.h"
#include "support/ToolchainSupport.h"
@@ -37,14 +38,7 @@
: _allocator(allocator), _blob(), _blob_size(blob_size)
{
ARM_COMPUTE_ERROR_ON(!allocator);
- _blob = _allocator->allocate(_blob_size, 0);
-}
-
-OffsetMemoryPool::~OffsetMemoryPool()
-{
- ARM_COMPUTE_ERROR_ON(!_allocator);
- _allocator->free(_blob);
- _blob = nullptr;
+ _blob = _allocator->make_region(blob_size, 0);
}
void OffsetMemoryPool::acquire(MemoryMappings &handles)
@@ -55,7 +49,7 @@
for(auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
- *handle.first = reinterpret_cast<uint8_t *>(_blob) + handle.second;
+ handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_size - handle.second));
}
}
@@ -64,7 +58,7 @@
for(auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
- *handle.first = nullptr;
+ handle.first->set_region(nullptr);
}
}
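
With this change the pool no longer patches raw pointers: acquire() hands each memory object an owned sub-region view of the blob obtained from IAllocator::make_region(). A minimal sketch of that region/sub-region relationship outside the pool, with illustrative sizes and offsets and the alignment argument assumed to accept 0 as "default".

#include "arm_compute/runtime/MemoryRegion.h"
#include "support/ToolchainSupport.h"

using namespace arm_compute;

void subregion_example()
{
    // Backing blob, standing in for what OffsetMemoryPool now obtains via make_region().
    const size_t blob_size = 1024;
    auto blob = support::cpp14::make_unique<MemoryRegion>(blob_size, 0 /* default alignment */);

    // A mapping entry at offset 256 becomes an owned sub-region spanning the rest of
    // the blob, mirroring what acquire() assigns to each handle.
    const size_t offset = 256;
    auto view = blob->extract_subregion(offset, blob_size - offset);
    (void)view;
}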
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
index 293241d..5ec2ce9 100644
--- a/src/runtime/PoolManager.cpp
+++ b/src/runtime/PoolManager.cpp
@@ -73,6 +73,36 @@
_sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size());
}
+std::unique_ptr<IMemoryPool> PoolManager::release_pool()
+{
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+ ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!");
+
+ if(!_free_pools.empty())
+ {
+ std::unique_ptr<IMemoryPool> pool = std::move(_free_pools.front());
+ ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr);
+ _free_pools.pop_front();
+
+ // Update semaphore
+ _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size());
+
+ return pool;
+ }
+
+ return nullptr;
+}
+
+void PoolManager::clear_pools()
+{
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+ ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to clear the PoolManager!");
+ _free_pools.clear();
+
+ // Update semaphore
+ _sem = nullptr;
+}
+
size_t PoolManager::num_pools() const
{
std::lock_guard<arm_compute::Mutex> lock(_mtx);
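
The two new entry points complete the pool life cycle: release_pool() hands back one free pool (or nullptr if none is left) and clear_pools() drops them all; both assert that no pool is still occupied. A minimal teardown sketch, assuming a PoolManager that was populated elsewhere via register_pool().

#include "arm_compute/runtime/IMemoryPool.h"
#include "arm_compute/runtime/PoolManager.h"
#include <memory>

using namespace arm_compute;

void teardown(PoolManager &pool_mgr)
{
    // Take ownership of one free pool; returns nullptr if the manager holds none.
    std::unique_ptr<IMemoryPool> pool = pool_mgr.release_pool();
    (void)pool; // could be re-registered with another manager or simply destroyed

    // Drop whatever is left; only valid once every pool has been released by its users.
    pool_mgr.clear_pools();
}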
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index c84a271..5fa51d7 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -127,39 +127,35 @@
uint8_t *TensorAllocator::data() const
{
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
+ return (_memory.region() == nullptr) ? nullptr : reinterpret_cast<uint8_t *>(_memory.region()->buffer());
}
void TensorAllocator::allocate()
{
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
-
if(_associated_memory_group == nullptr)
{
- _memory = Memory(std::make_shared<MemoryRegion>(info().total_size(), alignment()));
+ _memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(info().total_size(), alignment()));
}
else
{
- _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(_memory.region()->handle()), info().total_size());
- _memory.region()->set_size(info().total_size());
+ _associated_memory_group->finalize_memory(_owner, _memory, info().total_size());
}
info().set_is_resizable(false);
}
void TensorAllocator::free()
{
- _memory = Memory();
+ _memory.set_region(nullptr);
info().set_is_resizable(true);
}
-arm_compute::Status TensorAllocator::import_memory(Memory memory)
+arm_compute::Status TensorAllocator::import_memory(void *memory, size_t size)
{
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->buffer() == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(memory == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(size == 0);
ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
- _memory = memory;
+
+ _memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(memory, info().total_size()));
info().set_is_resizable(false);
return Status{};
@@ -167,10 +163,10 @@
void TensorAllocator::set_associated_memory_group(MemoryGroup *associated_memory_group)
{
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
- ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region() != nullptr && _memory.region()->buffer() != nullptr);
+
_associated_memory_group = associated_memory_group;
}
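
The reworked import path wraps the raw user pointer in a MemoryRegion internally, so the caller now passes just the pointer and its size; no allocate() call is needed afterwards and the buffer stays owned by the caller. A minimal usage sketch with an illustrative shape and data type.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include <vector>

using namespace arm_compute;

void import_example()
{
    Tensor tensor;
    tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    // User-owned backing buffer; it must be at least info().total_size() bytes and
    // outlive the tensor, since the allocator does not take ownership of it.
    std::vector<float> buffer(16 * 16);

    const Status st = tensor.allocator()->import_memory(buffer.data(), buffer.size() * sizeof(float));
    ARM_COMPUTE_ERROR_ON(st.error_code() != ErrorCode::OK);

    // The tensor is now backed by `buffer` and marked as not resizable.
}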