arm_compute v17.09
Change-Id: I4bf8f4e6e5f84ce0d5b6f5ba570d276879f42a81
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 8b6419c..8a7f37a 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -194,8 +194,8 @@
PaddingSize padding;
padding.left = std::max(0, -_start_x);
padding.right = std::max<int>(0, _end_x - shape[0]);
- padding.top = shape.num_dimensions() == 1 ? 0 : std::max(0, -_start_y);
- padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, _end_y - shape[1]);
+ padding.top = std::max(0, -_start_y);
+ padding.bottom = std::max<int>(0, _end_y - shape[1]);
// Update strides in tensor info
return _info->extend_padding(padding);
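
For reference, the padding rule in this hunk can be read as a small standalone computation: any part of the static access window [start, end) that falls outside the tensor shape becomes border padding on that side (after this change the top/bottom terms are computed even for 1D shapes). A minimal C++ sketch, with names local to this example:

#include <algorithm>
#include <cstdio>

// Illustrative only: mirrors the padding formula above for a 2D shape.
struct Padding
{
    int left, right, top, bottom;
};

Padding padding_for_window(int start_x, int end_x, int start_y, int end_y, int shape_x, int shape_y)
{
    Padding p{};
    p.left   = std::max(0, -start_x);        // window starts left of column 0
    p.right  = std::max(0, end_x - shape_x); // window ends past the last column
    p.top    = std::max(0, -start_y);        // now also computed for 1D shapes
    p.bottom = std::max(0, end_y - shape_y);
    return p;
}

int main()
{
    // A 3x3 neighbourhood on a 16x16 tensor accessed from (-1,-1) to (17,17)
    const Padding p = padding_for_window(-1, 17, -1, 17, 16, 16);
    std::printf("left=%d right=%d top=%d bottom=%d\n", p.left, p.right, p.top, p.bottom); // 1 1 1 1
    return 0;
}
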
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index b3605c4..b104330 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -66,8 +66,8 @@
// a size of the region.
// As the relation between input and output is transposed window.y() is
// used for x shape and window.x() for y shape.
- shape.set(0, std::min<int>(old_anchor[1] + old_shape[1] - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
- shape.set(1, std::min<int>(old_anchor[0] + old_shape[0] - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+ shape.set(0, std::min<int>((old_anchor[1] + old_shape[1]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+ shape.set(1, std::min<int>((old_anchor[0] + old_shape[0]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
// For higher dimensions use the intersection of the window size and the
// valid region of the input
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 21b72dd..821fb4c 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -27,40 +27,36 @@
#include "arm_compute/core/Types.h"
#include <map>
+#include <regex>
#include <vector>
namespace
{
-arm_compute::GPUTarget get_bifrost_target(const std::string &name)
+arm_compute::GPUTarget get_bifrost_target(const std::string &version)
{
- arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
-
- if(name == "G7")
+ if(version == "70")
{
- target = arm_compute::GPUTarget::G70;
+ return arm_compute::GPUTarget::G70;
}
-
- return target;
+ else
+ {
+ return arm_compute::GPUTarget::BIFROST;
+ }
}
-arm_compute::GPUTarget get_midgard_target(const std::string &name)
+arm_compute::GPUTarget get_midgard_target(const std::string &version)
{
- arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
-
- if(name == "T6")
+ switch(version[0])
{
- target = arm_compute::GPUTarget::T600;
+ case '6':
+ return arm_compute::GPUTarget::T600;
+ case '7':
+ return arm_compute::GPUTarget::T700;
+ case '8':
+ return arm_compute::GPUTarget::T800;
+ default:
+ return arm_compute::GPUTarget::MIDGARD;
}
- else if(name == "T7")
- {
- target = arm_compute::GPUTarget::T700;
- }
- else if(name == "T8")
- {
- target = arm_compute::GPUTarget::T800;
- }
-
- return target;
}
} // namespace
@@ -72,16 +68,22 @@
{
case DataType::U8:
return "uchar";
+ case DataType::QS8:
+ return "qs8";
case DataType::S8:
return "char";
case DataType::U16:
return "ushort";
case DataType::S16:
return "short";
+ case DataType::QS16:
+ return "qs16";
case DataType::U32:
return "uint";
case DataType::S32:
return "int";
+ case DataType::QS32:
+ return "qs32";
case DataType::U64:
return "ulong";
case DataType::S64:
@@ -96,6 +98,47 @@
}
}
+std::string get_data_size_from_data_type(const DataType &dt)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ case DataType::QS8:
+ case DataType::S8:
+ return "8";
+ case DataType::U16:
+ case DataType::S16:
+ case DataType::QS16:
+ case DataType::F16:
+ return "16";
+ case DataType::U32:
+ case DataType::S32:
+ case DataType::F32:
+ return "32";
+ case DataType::U64:
+ case DataType::S64:
+ return "64";
+ default:
+ ARM_COMPUTE_ERROR("Unsupported input data type.");
+ return "0";
+ }
+}
+
+std::string get_underlying_cl_type_from_data_type(const DataType &dt)
+{
+ switch(dt)
+ {
+ case DataType::QS8:
+ return "char";
+ case DataType::QS16:
+ return "short";
+ case DataType::QS32:
+ return "int";
+ default:
+ return get_cl_type_from_data_type(dt);
+ }
+}
+
const std::string &string_from_target(GPUTarget target)
{
static std::map<GPUTarget, const std::string> gpu_target_map =
@@ -113,53 +156,104 @@
GPUTarget get_target_from_device(cl::Device &device)
{
- const std::string name_mali("Mali-");
- GPUTarget target{ GPUTarget::MIDGARD };
-
- size_t name_size = 0;
- std::vector<char> name;
+ size_t name_size = 0;
// Query device name size
cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, 0, nullptr, &name_size);
ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (name_size == 0), "clGetDeviceInfo failed to return valid information");
- // Resize vector
- name.resize(name_size);
+ ARM_COMPUTE_UNUSED(err);
+
+ std::vector<char> name_buffer(name_size);
+
// Query device name
- err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name.data(), nullptr);
+ err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name_buffer.data(), nullptr);
ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
ARM_COMPUTE_UNUSED(err);
- std::string name_str(name.begin(), name.end());
- auto pos = name_str.find(name_mali);
+ std::regex mali_regex(R"(Mali-([TG])(\d+))");
+ std::string device_name(name_buffer.begin(), name_buffer.end());
+ std::smatch name_parts;
+ const bool found_mali = std::regex_search(device_name, name_parts, mali_regex);
- if(pos != std::string::npos)
+ if(!found_mali)
{
- ARM_COMPUTE_ERROR_ON_MSG((pos + name_mali.size() + 2) > name_str.size(), "Device name is shorter than expected.");
- std::string sub_name = name_str.substr(pos + name_mali.size(), 2);
+ ARM_COMPUTE_INFO("Can't find valid Mali GPU. Target is set to MIDGARD.");
+ return GPUTarget::MIDGARD;
+ }
- if(sub_name[0] == 'G')
- {
- target = get_bifrost_target(sub_name);
- }
- else if(sub_name[0] == 'T')
- {
- target = get_midgard_target(sub_name);
- }
- else
- {
+ const char target = name_parts.str(1)[0];
+ const std::string &version = name_parts.str(2);
+
+ switch(target)
+ {
+ case 'T':
+ return get_midgard_target(version);
+ case 'G':
+ return get_bifrost_target(version);
+ default:
ARM_COMPUTE_INFO("Mali GPU unknown. Target is set to the default one.");
- }
+ return GPUTarget::MIDGARD;
}
- else
- {
- ARM_COMPUTE_INFO("Can't find valid Mali GPU. Target is set to the default one.");
- }
-
- return target;
}
GPUTarget get_arch_from_target(GPUTarget target)
{
return (target & GPUTarget::GPU_ARCH_MASK);
}
+
+bool non_uniform_workgroup_support(const cl::Device &device)
+{
+ std::vector<char> extension;
+ size_t extension_size = 0;
+ cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_EXTENSIONS, 0, nullptr, &extension_size);
+ ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (extension_size == 0), "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+ // Resize vector
+ extension.resize(extension_size);
+ // Query extension
+ err = clGetDeviceInfo(device.get(), CL_DEVICE_EXTENSIONS, extension_size, extension.data(), nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+
+ std::string extension_str(extension.begin(), extension.end());
+ auto pos = extension_str.find("cl_arm_non_uniform_work_group_size");
+ return (pos != std::string::npos);
+}
+
+CLVersion get_cl_version(const cl::Device &device)
+{
+ std::vector<char> version;
+ size_t version_size = 0;
+ cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_VERSION, 0, nullptr, &version_size);
+ ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (version_size == 0), "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+
+ // Resize vector
+ version.resize(version_size);
+ // Query version
+ err = clGetDeviceInfo(device.get(), CL_DEVICE_VERSION, version_size, version.data(), nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+
+ std::string version_str(version.begin(), version.end());
+ if(version_str.find("OpenCL 2") != std::string::npos)
+ {
+ return CLVersion::CL20;
+ }
+ else if(version_str.find("OpenCL 1.2") != std::string::npos)
+ {
+ return CLVersion::CL12;
+ }
+ else if(version_str.find("OpenCL 1.1") != std::string::npos)
+ {
+ return CLVersion::CL11;
+ }
+ else if(version_str.find("OpenCL 1.0") != std::string::npos)
+ {
+ return CLVersion::CL10;
+ }
+
+ return CLVersion::UNKNOWN;
+}
+
} // namespace arm_compute
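
The refactored get_target_from_device() above replaces manual substring slicing with a regex over the device name. A self-contained sketch of the same decision logic (the enum values and helper names are local to this example, not the library's API):

#include <iostream>
#include <regex>
#include <string>

enum class GPUTarget { MIDGARD, T600, T700, T800, BIFROST, G70 };

GPUTarget target_from_name(const std::string &device_name)
{
    static const std::regex mali_regex(R"(Mali-([TG])(\d+))");
    std::smatch parts;
    if(!std::regex_search(device_name, parts, mali_regex))
    {
        return GPUTarget::MIDGARD; // no Mali GPU found, fall back to the default
    }
    const char        family  = parts.str(1)[0];
    const std::string version = parts.str(2);
    if(family == 'T') // Midgard: dispatch on the leading digit (6xx/7xx/8xx)
    {
        switch(version[0])
        {
            case '6': return GPUTarget::T600;
            case '7': return GPUTarget::T700;
            case '8': return GPUTarget::T800;
            default:  return GPUTarget::MIDGARD;
        }
    }
    // Bifrost: only "70" maps to a specific target in this patch
    return (version == "70") ? GPUTarget::G70 : GPUTarget::BIFROST;
}

int main()
{
    std::cout << static_cast<int>(target_from_name("Mali-T760")) << "\n"; // T700
    std::cout << static_cast<int>(target_from_name("Mali-G71")) << "\n";  // BIFROST
    return 0;
}
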
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 15a5d90..e165cf3 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -23,9 +23,11 @@
*/
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
+#include <algorithm>
#include <fstream>
#include <iostream>
#include <utility>
@@ -142,32 +144,49 @@
{ "copy_plane", "channel_extract.cl" },
{ "copy_planes_3p", "channel_combine.cl" },
{ "copy_to_keypoint", "fast_corners.cl" },
+ { "depthwise_convolution_3x3", "depthwise_convolution.cl" },
+ { "depthwise_im2col", "depthwise_convolution.cl" },
+ { "depthwise_vector_to_tensor", "depthwise_convolution.cl" },
+ { "depthwise_weights_reshape", "depthwise_convolution.cl" },
+ { "dequantization_layer", "dequantization_layer.cl" },
{ "derivative", "derivative.cl" },
{ "dilate", "dilate.cl" },
+ { "direct_convolution1x1", "direct_convolution1x1.cl" },
+ { "direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl" },
+ { "direct_convolution3x3", "direct_convolution3x3.cl" },
+ { "direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl" },
+ { "direct_convolution5x5", "direct_convolution5x5.cl" },
+ { "direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl" },
{ "erode", "erode.cl" },
{ "fast_corners", "fast_corners.cl" },
{ "fill_image_borders_constant", "fill_border.cl" },
{ "fill_image_borders_replicate", "fill_border.cl" },
{ "finalize", "optical_flow_pyramid_lk.cl" },
+ { "floor_layer", "floor.cl" },
{ "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
{ "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
- { "gemm_accumulate_biases_f16", "gemm.cl" },
- { "gemm_accumulate_biases_f32", "gemm.cl" },
+ { "gemm_accumulate_biases", "gemm.cl" },
{ "gemm_interleave4x4_8bit", "gemm.cl" },
{ "gemm_interleave4x4_16bit", "gemm.cl" },
{ "gemm_interleave4x4_32bit", "gemm.cl" },
{ "gemm_ma_f16", "gemm.cl" },
{ "gemm_ma_f32", "gemm.cl" },
- { "gemm_mm_u8", "gemm.cl" },
- { "gemm_mm_f16", "gemm.cl" },
- { "gemm_mm_f32_midgard", "gemm.cl" },
- { "gemm_mm_f32_bifrost", "gemm.cl" },
- { "gemm_vm_f16", "gemm.cl" },
- { "gemm_vm_f32", "gemm.cl" },
+ { "gemm_ma_qs8", "gemm.cl" },
+ { "gemm_ma_qs16", "gemm.cl" },
+ { "gemm_mv", "gemv.cl" },
+ { "gemm_mm_interleaved_transposed_u8", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_f32_midgard", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_qs8", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_qs16", "gemm.cl" },
+ { "gemm_mm_floating_point", "gemm.cl" },
+ { "gemm_mm_qs8", "gemm.cl" },
+ { "gemm_mm_qs16", "gemm.cl" },
{ "gemm_lc_vm_f32", "gemm.cl" },
- { "gemm_transpose1x16_u8", "gemm.cl" },
- { "gemm_transpose1x8_f16", "gemm.cl" },
- { "gemm_transpose1x4_f32", "gemm.cl" },
+ { "gemm_transpose1x16", "gemm.cl" },
+ { "gemm_transpose1x8", "gemm.cl" },
+ { "gemm_transpose1x4", "gemm.cl" },
{ "harris_score_3x3", "harris_corners.cl" },
{ "harris_score_5x5", "harris_corners.cl" },
{ "harris_score_7x7", "harris_corners.cl" },
@@ -180,6 +199,7 @@
{ "hog_orientation_binning", "hog.cl" },
{ "hysteresis", "canny.cl" },
{ "im2col_generic", "convolution_layer.cl" },
+ { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cl" },
{ "im2col_reduced", "convolution_layer.cl" },
{ "init_level", "optical_flow_pyramid_lk.cl" },
{ "init_level_max", "optical_flow_pyramid_lk.cl" },
@@ -190,12 +210,14 @@
{ "IYUV_to_RGB888_bt709", "color_convert.cl" },
{ "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
{ "IYUV_to_YUV444_bt709", "color_convert.cl" },
+ { "l2_normalize", "l2_normalize.cl" },
{ "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
{ "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
{ "magnitude_phase", "magnitude_phase.cl" },
{ "mean_stddev_accumulate", "mean_stddev.cl" },
{ "minmax", "minmaxloc.cl" },
{ "minmax_border", "minmaxloc.cl" },
+ { "minmax_layer", "minmax_layer.cl" },
{ "minmaxloc", "minmaxloc.cl" },
{ "non_linear_filter_box3x3", "non_linear_filter3x3.cl" },
{ "non_linear_filter_cross3x3", "non_linear_filter3x3.cl" },
@@ -219,8 +241,14 @@
{ "pixelwise_mul_int", "pixelwise_mul_int.cl" },
{ "pooling_layer_2", "pooling_layer.cl" },
{ "pooling_layer_3", "pooling_layer.cl" },
+ { "pooling_layer_3_optimized", "pooling_layer.cl" },
+ { "pooling_layer_7", "pooling_layer.cl" },
+ { "pooling_layer_N", "pooling_layer.cl" },
+ { "quantization_layer", "quantization_layer.cl" },
+ { "reduction_operation", "reduction_operation.cl" },
{ "remap_nearest_neighbour", "remap.cl" },
{ "remap_bilinear", "remap.cl" },
+ { "reshape_layer", "reshape_layer.cl" },
{ "reshape_to_columns", "convolution_layer.cl" },
{ "RGB888_to_IYUV_bt709", "color_convert.cl" },
{ "RGB888_to_NV12_bt709", "color_convert.cl" },
@@ -230,6 +258,7 @@
{ "RGBA8888_to_NV12_bt709", "color_convert.cl" },
{ "RGBA8888_to_RGB888_bt709", "color_convert.cl" },
{ "RGBA8888_to_YUV444_bt709", "color_convert.cl" },
+ { "roi_pooling_layer", "roi_pooling_layer.cl" },
{ "scale_nearest_neighbour", "scale.cl" },
{ "scale_bilinear", "scale.cl" },
{ "scharr3x3", "scharr_filter.cl" },
@@ -333,6 +362,14 @@
#include "./cl_kernels/depth_convert.clembed"
},
{
+ "depthwise_convolution.cl",
+#include "./cl_kernels/depthwise_convolution.clembed"
+ },
+ {
+ "dequantization_layer.cl",
+#include "./cl_kernels/dequantization_layer.clembed"
+ },
+ {
"derivative.cl",
#include "./cl_kernels/derivative.clembed"
},
@@ -341,6 +378,18 @@
#include "./cl_kernels/dilate.clembed"
},
{
+ "direct_convolution1x1.cl",
+#include "./cl_kernels/direct_convolution1x1.clembed"
+ },
+ {
+ "direct_convolution3x3.cl",
+#include "./cl_kernels/direct_convolution3x3.clembed"
+ },
+ {
+ "direct_convolution5x5.cl",
+#include "./cl_kernels/direct_convolution5x5.clembed"
+ },
+ {
"erode.cl",
#include "./cl_kernels/erode.clembed"
},
@@ -353,6 +402,14 @@
#include "./cl_kernels/fill_border.clembed"
},
{
+ "fixed_point.h",
+#include "./cl_kernels/fixed_point.hembed"
+ },
+ {
+ "floor.cl",
+#include "./cl_kernels/floor.clembed"
+ },
+ {
"gaussian_pyramid.cl",
#include "./cl_kernels/gaussian_pyramid.clembed"
},
@@ -361,6 +418,10 @@
#include "./cl_kernels/gemm.clembed"
},
{
+ "gemv.cl",
+#include "./cl_kernels/gemv.clembed"
+ },
+ {
"harris_corners.cl",
#include "./cl_kernels/harris_corners.clembed"
},
@@ -381,6 +442,10 @@
#include "./cl_kernels/integral_image.clembed"
},
{
+ "l2_normalize.cl",
+#include "./cl_kernels/l2_normalize.clembed"
+ },
+ {
"magnitude_phase.cl",
#include "./cl_kernels/magnitude_phase.clembed"
},
@@ -393,6 +458,10 @@
#include "./cl_kernels/minmaxloc.clembed"
},
{
+ "minmax_layer.cl",
+#include "./cl_kernels/minmax_layer.clembed"
+ },
+ {
"non_linear_filter3x3.cl",
#include "./cl_kernels/non_linear_filter3x3.clembed"
},
@@ -433,10 +502,26 @@
#include "./cl_kernels/pooling_layer.clembed"
},
{
+ "quantization_layer.cl",
+#include "./cl_kernels/quantization_layer.clembed"
+ },
+ {
+ "reduction_operation.cl",
+#include "./cl_kernels/reduction_operation.clembed"
+ },
+ {
"remap.cl",
#include "./cl_kernels/remap.clembed"
},
{
+ "reshape_layer.cl",
+#include "./cl_kernels/reshape_layer.clembed"
+ },
+ {
+ "roi_pooling_layer.cl",
+#include "./cl_kernels/roi_pooling_layer.clembed"
+ },
+ {
"scale.cl",
#include "./cl_kernels/scale.clembed"
},
@@ -479,12 +564,12 @@
{
"warp_perspective.cl",
#include "./cl_kernels/warp_perspective.clembed"
- }
-#endif
+ },
+#endif /* EMBEDDED_KERNELS */
};
CLKernelLibrary::CLKernelLibrary()
- : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map(), _max_workgroup_size(0)
{
}
@@ -504,9 +589,25 @@
ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
}
+ std::string concat_str;
+
+ if(non_uniform_workgroup_support(_device))
+ {
+ concat_str += " -cl-arm-non-uniform-work-group-size ";
+ }
+ else if(get_cl_version(_device) == CLVersion::CL20)
+ {
+ concat_str += " -cl-std=CL2.0 ";
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
+ }
+
// Check if the program has been built before with same build options.
- const std::string program_name = kernel_program_it->second;
- const std::string build_options = stringify_set(build_options_set);
+ const std::string program_name = kernel_program_it->second;
+ const std::string build_options = stringify_set(build_options_set) + concat_str;
+
const std::string built_program_name = program_name + "_" + build_options;
auto built_program_it = _built_programs_map.find(built_program_name);
@@ -553,7 +654,7 @@
}
program = Program(_context, program_name, program_source_it->second);
-#else
+#else /* EMBEDDED_KERNELS */
// Check for binary
std::string source_name = _kernel_path + program_name;
std::string binary_name = source_name + "bin";
@@ -571,7 +672,7 @@
{
ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str());
}
-#endif
+#endif /* EMBEDDED_KERNELS */
// Insert program to program map
const auto new_program = _programs_map.emplace(program_name, std::move(program));
@@ -581,7 +682,7 @@
std::string CLKernelLibrary::stringify_set(const StringSet &s) const
{
- std::string concat_set = "-cl-arm-non-uniform-work-group-size ";
+ std::string concat_set;
#ifndef EMBEDDED_KERNELS
concat_set += "-I" + _kernel_path + " ";
@@ -595,3 +696,32 @@
return concat_set;
}
+
+std::string CLKernelLibrary::get_program_source(const std::string &program_name)
+{
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if(program_source_it == _program_source_map.end())
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ return program_source_it->second;
+}
+
+size_t CLKernelLibrary::max_local_workgroup_size()
+{
+ if(_max_workgroup_size == 0)
+ {
+ size_t err = clGetDeviceInfo(_device.get(), CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &_max_workgroup_size, nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+ }
+
+ return _max_workgroup_size;
+}
+
+cl::NDRange CLKernelLibrary::default_ndrange()
+{
+ return cl::NDRange(std::min<size_t>(_max_workgroup_size, 128u), 1);
+}
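
Two behaviours added above are easy to state in isolation: the extra build options now depend on device capabilities (the -cl-arm-non-uniform-work-group-size flag moved out of stringify_set()), and the default local work size is capped at 128. A hedged sketch, with the device queries reduced to plain booleans for illustration:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>

// Illustrative only: the real values come from clGetDeviceInfo queries
// (CL_DEVICE_EXTENSIONS and CL_DEVICE_VERSION).
std::string extra_build_options(bool has_non_uniform_wg_extension, bool is_cl20_device)
{
    if(has_non_uniform_wg_extension)
    {
        return " -cl-arm-non-uniform-work-group-size ";
    }
    if(is_cl20_device)
    {
        // OpenCL 2.0 supports non-uniform work-groups natively
        return " -cl-std=CL2.0 ";
    }
    throw std::runtime_error("Non-uniform workgroup size is not supported");
}

// First dimension of default_ndrange(): at most 128 work-items, capped by
// the device's CL_DEVICE_MAX_WORK_GROUP_SIZE.
std::size_t default_local_size(std::size_t max_workgroup_size)
{
    return std::min<std::size_t>(max_workgroup_size, 128u);
}

int main()
{
    std::cout << extra_build_options(true, false) << "| lws=" << default_local_size(256) << "\n";
    return 0;
}
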
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 7ac0fe3..1e04f00 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -31,7 +31,6 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
#include <cstddef>
@@ -44,7 +43,10 @@
return;
}
- ARM_COMPUTE_ERROR_ON((0 == (window.x().end() - window.x().start())) || (0 == (window.y().end() - window.y().start())));
+ if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
+ {
+ return;
+ }
cl::NDRange gws((window.x().end() - window.x().start()) / window.x().step(),
(window.y().end() - window.y().start()) / window.y().step(),
@@ -61,7 +63,7 @@
}
ICLKernel::ICLKernel()
- : _kernel(nullptr), _lws_hint(cl::Range_128_1), _target(CLScheduler::get().target())
+ : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id)
{
}
@@ -71,12 +73,6 @@
}
template <unsigned int dimension_size>
-unsigned int ICLKernel::num_arguments_per_tensor() const
-{
- return 2 + 2 * dimension_size;
-}
-
-template <unsigned int dimension_size>
void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
@@ -123,6 +119,16 @@
add_tensor_argument<3>(idx, tensor, window);
}
+void ICLKernel::add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+{
+ add_tensor_argument<4>(idx, tensor, window);
+}
+
+unsigned int ICLKernel::num_arguments_per_1D_array() const
+{
+ return num_arguments_per_array<1>();
+}
+
unsigned int ICLKernel::num_arguments_per_1D_tensor() const
{
return num_arguments_per_tensor<1>();
@@ -138,6 +144,11 @@
return num_arguments_per_tensor<3>();
}
+unsigned int ICLKernel::num_arguments_per_4D_tensor() const
+{
+ return num_arguments_per_tensor<4>();
+}
+
void ICLKernel::set_target(cl::Device &device)
{
_target = get_target_from_device(device);
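
The removed num_arguments_per_tensor() definition above (and the new per-dimension wrappers) follow a simple rule that the kernel parameter lists elsewhere in this patch make visible: each tensor is passed as one buffer pointer, one first-element offset, and a (stride, step) pair per dimension. A minimal sketch of that count:

#include <cstdio>

// 2 fixed arguments (pointer + offset_first_element_in_bytes) plus a
// (stride, step) pair for every dimension.
template <unsigned int dimension_size>
constexpr unsigned int num_arguments_per_tensor()
{
    return 2 + 2 * dimension_size;
}

int main()
{
    std::printf("1D:%u 2D:%u 3D:%u 4D:%u\n",
                num_arguments_per_tensor<1>(), num_arguments_per_tensor<2>(),
                num_arguments_per_tensor<3>(), num_arguments_per_tensor<4>());
    return 0;
}
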
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 3b8dfd2..1d04f39 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -27,114 +27,94 @@
#include <dlfcn.h>
#include <iostream>
-using clBuildProgram_func = cl_int (*)(cl_program, cl_uint, const cl_device_id *, const char *, void (*pfn_notify)(cl_program, void *), void *);
-using clEnqueueNDRangeKernel_func = cl_int (*)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
-using clSetKernelArg_func = cl_int (*)(cl_kernel, cl_uint, size_t, const void *);
-using clReleaseMemObject_func = cl_int (*)(cl_mem);
-using clEnqueueUnmapMemObject_func = cl_int (*)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *);
-using clRetainCommandQueue_func = cl_int (*)(cl_command_queue command_queue);
-using clReleaseContext_func = cl_int (*)(cl_context);
-using clReleaseEvent_func = cl_int (*)(cl_event);
-using clEnqueueWriteBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
-using clEnqueueReadBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
-using clGetProgramBuildInfo_func = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *);
-using clRetainProgram_func = cl_int (*)(cl_program program);
-using clEnqueueMapBuffer_func = void *(*)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
-using clReleaseCommandQueue_func = cl_int (*)(cl_command_queue);
-using clCreateProgramWithBinary_func = cl_program (*)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
-using clRetainContext_func = cl_int (*)(cl_context context);
-using clReleaseProgram_func = cl_int (*)(cl_program program);
-using clFlush_func = cl_int (*)(cl_command_queue command_queue);
-using clGetProgramInfo_func = cl_int (*)(cl_program, cl_program_info, size_t, void *, size_t *);
-using clCreateKernel_func = cl_kernel (*)(cl_program, const char *, cl_int *);
-using clRetainKernel_func = cl_int (*)(cl_kernel kernel);
-using clCreateBuffer_func = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
-using clCreateProgramWithSource_func = cl_program (*)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
-using clReleaseKernel_func = cl_int (*)(cl_kernel kernel);
-using clGetDeviceInfo_func = cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *);
-using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
-
-class CLSymbols
+namespace arm_compute
{
-private:
- CLSymbols()
+CLSymbols &CLSymbols::get()
+{
+ static CLSymbols symbols;
+ return symbols;
+}
+
+bool CLSymbols::load_default()
+{
+ static const std::vector<std::string> libraries{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" };
+
+ if(_loaded.first)
{
- void *handle = dlopen("libOpenCL.so", RTLD_LAZY | RTLD_LOCAL);
- if(handle == nullptr)
+ return _loaded.second;
+ }
+
+ // Indicate that default loading has been tried
+ _loaded.first = true;
+
+ for(const auto &lib : libraries)
+ {
+ if(load(lib))
{
- std::cerr << "Can't load libOpenCL.so: " << dlerror() << std::endl;
- }
- else
- {
- clBuildProgram = reinterpret_cast<clBuildProgram_func>(dlsym(handle, "clBuildProgram"));
- clEnqueueNDRangeKernel = reinterpret_cast<clEnqueueNDRangeKernel_func>(dlsym(handle, "clEnqueueNDRangeKernel"));
- clSetKernelArg = reinterpret_cast<clSetKernelArg_func>(dlsym(handle, "clSetKernelArg"));
- clReleaseKernel = reinterpret_cast<clReleaseKernel_func>(dlsym(handle, "clReleaseKernel"));
- clCreateProgramWithSource = reinterpret_cast<clCreateProgramWithSource_func>(dlsym(handle, "clCreateProgramWithSource"));
- clCreateBuffer = reinterpret_cast<clCreateBuffer_func>(dlsym(handle, "clCreateBuffer"));
- clRetainKernel = reinterpret_cast<clRetainKernel_func>(dlsym(handle, "clRetainKernel"));
- clCreateKernel = reinterpret_cast<clCreateKernel_func>(dlsym(handle, "clCreateKernel"));
- clGetProgramInfo = reinterpret_cast<clGetProgramInfo_func>(dlsym(handle, "clGetProgramInfo"));
- clFlush = reinterpret_cast<clFlush_func>(dlsym(handle, "clFlush"));
- clReleaseProgram = reinterpret_cast<clReleaseProgram_func>(dlsym(handle, "clReleaseProgram"));
- clRetainContext = reinterpret_cast<clRetainContext_func>(dlsym(handle, "clRetainContext"));
- clCreateProgramWithBinary = reinterpret_cast<clCreateProgramWithBinary_func>(dlsym(handle, "clCreateProgramWithBinary"));
- clReleaseCommandQueue = reinterpret_cast<clReleaseCommandQueue_func>(dlsym(handle, "clReleaseCommandQueue"));
- clEnqueueMapBuffer = reinterpret_cast<clEnqueueMapBuffer_func>(dlsym(handle, "clEnqueueMapBuffer"));
- clRetainProgram = reinterpret_cast<clRetainProgram_func>(dlsym(handle, "clRetainProgram"));
- clGetProgramBuildInfo = reinterpret_cast<clGetProgramBuildInfo_func>(dlsym(handle, "clGetProgramBuildInfo"));
- clEnqueueReadBuffer = reinterpret_cast<clEnqueueReadBuffer_func>(dlsym(handle, "clEnqueueReadBuffer"));
- clEnqueueWriteBuffer = reinterpret_cast<clEnqueueWriteBuffer_func>(dlsym(handle, "clEnqueueWriteBuffer"));
- clReleaseEvent = reinterpret_cast<clReleaseEvent_func>(dlsym(handle, "clReleaseEvent"));
- clReleaseContext = reinterpret_cast<clReleaseContext_func>(dlsym(handle, "clReleaseContext"));
- clRetainCommandQueue = reinterpret_cast<clRetainCommandQueue_func>(dlsym(handle, "clRetainCommandQueue"));
- clEnqueueUnmapMemObject = reinterpret_cast<clEnqueueUnmapMemObject_func>(dlsym(handle, "clEnqueueUnmapMemObject"));
- clReleaseMemObject = reinterpret_cast<clReleaseMemObject_func>(dlsym(handle, "clReleaseMemObject"));
- clGetDeviceInfo = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
- clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
- dlclose(handle);
+ return true;
}
}
-public:
- static CLSymbols &get()
+ std::cerr << "Couldn't find any OpenCL library.\n";
+ return false;
+}
+
+bool CLSymbols::load(const std::string &library)
+{
+ void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL);
+
+ if(handle == nullptr)
{
- static CLSymbols symbols = CLSymbols();
- return symbols;
+ std::cerr << "Can't load " << library << ": " << dlerror() << "\n";
+ // Set status of loading to failed
+ _loaded.second = false;
+ return false;
}
- clBuildProgram_func clBuildProgram = nullptr;
- clEnqueueNDRangeKernel_func clEnqueueNDRangeKernel = nullptr;
- clSetKernelArg_func clSetKernelArg = nullptr;
- clReleaseKernel_func clReleaseKernel = nullptr;
- clCreateProgramWithSource_func clCreateProgramWithSource = nullptr;
- clCreateBuffer_func clCreateBuffer = nullptr;
- clRetainKernel_func clRetainKernel = nullptr;
- clCreateKernel_func clCreateKernel = nullptr;
- clGetProgramInfo_func clGetProgramInfo = nullptr;
- clFlush_func clFlush = nullptr;
- clReleaseProgram_func clReleaseProgram = nullptr;
- clRetainContext_func clRetainContext = nullptr;
- clCreateProgramWithBinary_func clCreateProgramWithBinary = nullptr;
- clReleaseCommandQueue_func clReleaseCommandQueue = nullptr;
- clEnqueueMapBuffer_func clEnqueueMapBuffer = nullptr;
- clRetainProgram_func clRetainProgram = nullptr;
- clGetProgramBuildInfo_func clGetProgramBuildInfo = nullptr;
- clEnqueueReadBuffer_func clEnqueueReadBuffer = nullptr;
- clEnqueueWriteBuffer_func clEnqueueWriteBuffer = nullptr;
- clReleaseEvent_func clReleaseEvent = nullptr;
- clReleaseContext_func clReleaseContext = nullptr;
- clRetainCommandQueue_func clRetainCommandQueue = nullptr;
- clEnqueueUnmapMemObject_func clEnqueueUnmapMemObject = nullptr;
- clReleaseMemObject_func clReleaseMemObject = nullptr;
- clGetDeviceInfo_func clGetDeviceInfo = nullptr;
- clGetDeviceIDs_func clGetDeviceIDs = nullptr;
-};
+ clBuildProgram = reinterpret_cast<clBuildProgram_func>(dlsym(handle, "clBuildProgram"));
+ clEnqueueNDRangeKernel = reinterpret_cast<clEnqueueNDRangeKernel_func>(dlsym(handle, "clEnqueueNDRangeKernel"));
+ clSetKernelArg = reinterpret_cast<clSetKernelArg_func>(dlsym(handle, "clSetKernelArg"));
+ clReleaseKernel = reinterpret_cast<clReleaseKernel_func>(dlsym(handle, "clReleaseKernel"));
+ clCreateProgramWithSource = reinterpret_cast<clCreateProgramWithSource_func>(dlsym(handle, "clCreateProgramWithSource"));
+ clCreateBuffer = reinterpret_cast<clCreateBuffer_func>(dlsym(handle, "clCreateBuffer"));
+ clRetainKernel = reinterpret_cast<clRetainKernel_func>(dlsym(handle, "clRetainKernel"));
+ clCreateKernel = reinterpret_cast<clCreateKernel_func>(dlsym(handle, "clCreateKernel"));
+ clGetProgramInfo = reinterpret_cast<clGetProgramInfo_func>(dlsym(handle, "clGetProgramInfo"));
+ clFlush = reinterpret_cast<clFlush_func>(dlsym(handle, "clFlush"));
+ clFinish = reinterpret_cast<clFinish_func>(dlsym(handle, "clFinish"));
+ clReleaseProgram = reinterpret_cast<clReleaseProgram_func>(dlsym(handle, "clReleaseProgram"));
+ clRetainContext = reinterpret_cast<clRetainContext_func>(dlsym(handle, "clRetainContext"));
+ clCreateProgramWithBinary = reinterpret_cast<clCreateProgramWithBinary_func>(dlsym(handle, "clCreateProgramWithBinary"));
+ clReleaseCommandQueue = reinterpret_cast<clReleaseCommandQueue_func>(dlsym(handle, "clReleaseCommandQueue"));
+ clEnqueueMapBuffer = reinterpret_cast<clEnqueueMapBuffer_func>(dlsym(handle, "clEnqueueMapBuffer"));
+ clRetainProgram = reinterpret_cast<clRetainProgram_func>(dlsym(handle, "clRetainProgram"));
+ clGetProgramBuildInfo = reinterpret_cast<clGetProgramBuildInfo_func>(dlsym(handle, "clGetProgramBuildInfo"));
+ clEnqueueReadBuffer = reinterpret_cast<clEnqueueReadBuffer_func>(dlsym(handle, "clEnqueueReadBuffer"));
+ clEnqueueWriteBuffer = reinterpret_cast<clEnqueueWriteBuffer_func>(dlsym(handle, "clEnqueueWriteBuffer"));
+ clReleaseEvent = reinterpret_cast<clReleaseEvent_func>(dlsym(handle, "clReleaseEvent"));
+ clReleaseContext = reinterpret_cast<clReleaseContext_func>(dlsym(handle, "clReleaseContext"));
+ clRetainCommandQueue = reinterpret_cast<clRetainCommandQueue_func>(dlsym(handle, "clRetainCommandQueue"));
+ clEnqueueUnmapMemObject = reinterpret_cast<clEnqueueUnmapMemObject_func>(dlsym(handle, "clEnqueueUnmapMemObject"));
+ clRetainMemObject = reinterpret_cast<clRetainMemObject_func>(dlsym(handle, "clRetainMemObject"));
+ clReleaseMemObject = reinterpret_cast<clReleaseMemObject_func>(dlsym(handle, "clReleaseMemObject"));
+ clGetDeviceInfo = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
+ clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
+ clRetainEvent = reinterpret_cast<clRetainEvent_func>(dlsym(handle, "clRetainEvent"));
-bool arm_compute::opencl_is_available()
+ dlclose(handle);
+
+ // Disable default loading and set status to successful
+ _loaded = std::make_pair(true, true);
+
+ return true;
+}
+
+bool opencl_is_available()
{
+ CLSymbols::get().load_default();
return CLSymbols::get().clBuildProgram != nullptr;
}
+} // namespace arm_compute
cl_int clBuildProgram(
cl_program program,
@@ -144,7 +124,8 @@
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data)
{
- auto func = CLSymbols::get().clBuildProgram;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clBuildProgram;
if(func != nullptr)
{
return func(program, num_devices, device_list, options, pfn_notify, user_data);
@@ -166,7 +147,8 @@
const cl_event *event_wait_list,
cl_event *event)
{
- auto func = CLSymbols::get().clEnqueueNDRangeKernel;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel;
if(func != nullptr)
{
return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
@@ -183,7 +165,8 @@
size_t arg_size,
const void *arg_value)
{
- auto func = CLSymbols::get().clSetKernelArg;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clSetKernelArg;
if(func != nullptr)
{
return func(kernel, arg_index, arg_size, arg_value);
@@ -194,9 +177,24 @@
}
}
+cl_int clRetainMemObject(cl_mem memobj)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainMemObject;
+ if(func != nullptr)
+ {
+ return func(memobj);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
cl_int clReleaseMemObject(cl_mem memobj)
{
- auto func = CLSymbols::get().clReleaseMemObject;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseMemObject;
if(func != nullptr)
{
return func(memobj);
@@ -215,7 +213,8 @@
const cl_event *event_wait_list,
cl_event *event)
{
- auto func = CLSymbols::get().clEnqueueUnmapMemObject;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject;
if(func != nullptr)
{
return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
@@ -228,7 +227,8 @@
cl_int clRetainCommandQueue(cl_command_queue command_queue)
{
- auto func = CLSymbols::get().clRetainCommandQueue;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainCommandQueue;
if(func != nullptr)
{
return func(command_queue);
@@ -241,7 +241,8 @@
cl_int clReleaseContext(cl_context context)
{
- auto func = CLSymbols::get().clReleaseContext;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseContext;
if(func != nullptr)
{
return func(context);
@@ -253,7 +254,8 @@
}
cl_int clReleaseEvent(cl_event event)
{
- auto func = CLSymbols::get().clReleaseEvent;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseEvent;
if(func != nullptr)
{
return func(event);
@@ -275,7 +277,8 @@
const cl_event *event_wait_list,
cl_event *event)
{
- auto func = CLSymbols::get().clEnqueueWriteBuffer;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer;
if(func != nullptr)
{
return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
@@ -297,7 +300,8 @@
const cl_event *event_wait_list,
cl_event *event)
{
- auto func = CLSymbols::get().clEnqueueReadBuffer;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer;
if(func != nullptr)
{
return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
@@ -316,7 +320,8 @@
void *param_value,
size_t *param_value_size_ret)
{
- auto func = CLSymbols::get().clGetProgramBuildInfo;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo;
if(func != nullptr)
{
return func(program, device, param_name, param_value_size, param_value, param_value_size_ret);
@@ -329,7 +334,8 @@
cl_int clRetainProgram(cl_program program)
{
- auto func = CLSymbols::get().clRetainProgram;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainProgram;
if(func != nullptr)
{
return func(program);
@@ -352,7 +358,8 @@
cl_event *event,
cl_int *errcode_ret)
{
- auto func = CLSymbols::get().clEnqueueMapBuffer;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer;
if(func != nullptr)
{
return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
@@ -369,7 +376,8 @@
cl_int clReleaseCommandQueue(cl_command_queue command_queue)
{
- auto func = CLSymbols::get().clReleaseCommandQueue;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue;
if(func != nullptr)
{
return func(command_queue);
@@ -389,7 +397,8 @@
cl_int *binary_status,
cl_int *errcode_ret)
{
- auto func = CLSymbols::get().clCreateProgramWithBinary;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary;
if(func != nullptr)
{
return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret);
@@ -406,7 +415,8 @@
cl_int clRetainContext(cl_context context)
{
- auto func = CLSymbols::get().clRetainContext;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainContext;
if(func != nullptr)
{
return func(context);
@@ -419,7 +429,8 @@
cl_int clReleaseProgram(cl_program program)
{
- auto func = CLSymbols::get().clReleaseProgram;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseProgram;
if(func != nullptr)
{
return func(program);
@@ -432,7 +443,22 @@
cl_int clFlush(cl_command_queue command_queue)
{
- auto func = CLSymbols::get().clFlush;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clFlush;
+ if(func != nullptr)
+ {
+ return func(command_queue);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clFinish(cl_command_queue command_queue)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clFinish;
if(func != nullptr)
{
return func(command_queue);
@@ -450,7 +476,8 @@
void *param_value,
size_t *param_value_size_ret)
{
- auto func = CLSymbols::get().clGetProgramInfo;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetProgramInfo;
if(func != nullptr)
{
return func(program, param_name, param_value_size, param_value, param_value_size_ret);
@@ -466,7 +493,8 @@
const char *kernel_name,
cl_int *errcode_ret)
{
- auto func = CLSymbols::get().clCreateKernel;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clCreateKernel;
if(func != nullptr)
{
return func(program, kernel_name, errcode_ret);
@@ -483,7 +511,8 @@
cl_int clRetainKernel(cl_kernel kernel)
{
- auto func = CLSymbols::get().clRetainKernel;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainKernel;
if(func != nullptr)
{
return func(kernel);
@@ -501,7 +530,8 @@
void *host_ptr,
cl_int *errcode_ret)
{
- auto func = CLSymbols::get().clCreateBuffer;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clCreateBuffer;
if(func != nullptr)
{
return func(context, flags, size, host_ptr, errcode_ret);
@@ -523,7 +553,8 @@
const size_t *lengths,
cl_int *errcode_ret)
{
- auto func = CLSymbols::get().clCreateProgramWithSource;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource;
if(func != nullptr)
{
return func(context, count, strings, lengths, errcode_ret);
@@ -540,7 +571,8 @@
cl_int clReleaseKernel(cl_kernel kernel)
{
- auto func = CLSymbols::get().clReleaseKernel;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clReleaseKernel;
if(func != nullptr)
{
return func(kernel);
@@ -557,7 +589,8 @@
cl_device_id *devices,
cl_uint *num_devices)
{
- auto func = CLSymbols::get().clGetDeviceIDs;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetDeviceIDs;
if(func != nullptr)
{
return func(platform, device_type, num_entries, devices, num_devices);
@@ -574,7 +607,8 @@
void *param_value,
size_t *param_value_size_ret)
{
- auto func = CLSymbols::get().clGetDeviceInfo;
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetDeviceInfo;
if(func != nullptr)
{
return func(device, param_name, param_value_size, param_value, param_value_size_ret);
@@ -584,3 +618,17 @@
return CL_OUT_OF_RESOURCES;
}
}
+
+cl_int clRetainEvent(cl_event event)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clRetainEvent;
+ if(func != nullptr)
+ {
+ return func(event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
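
The rewritten OpenCL.cpp above turns CLSymbols into a lazily initialised singleton that searches several candidate libraries and resolves every cl* entry point through dlsym. A reduced sketch of that pattern, showing a single entry point and using plain void * instead of the real function-pointer typedefs:

#include <dlfcn.h>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Illustrative only: one symbol instead of the full OpenCL API surface.
class Symbols
{
public:
    static Symbols &get()
    {
        static Symbols symbols;
        return symbols;
    }

    bool load_default()
    {
        static const std::vector<std::string> libraries{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" };

        if(_loaded.first)
        {
            return _loaded.second; // the search is attempted only once
        }
        _loaded.first = true;

        for(const auto &lib : libraries)
        {
            if(load(lib))
            {
                return true;
            }
        }
        std::cerr << "Couldn't find any OpenCL library.\n";
        return false;
    }

    bool load(const std::string &library)
    {
        void *handle = dlopen(library.c_str(), RTLD_LAZY | RTLD_LOCAL);
        if(handle == nullptr)
        {
            _loaded.second = false;
            return false;
        }
        clBuildProgram = dlsym(handle, "clBuildProgram"); // one dlsym per entry point in the real code
        dlclose(handle);
        _loaded = std::make_pair(true, true);
        return true;
    }

    void *clBuildProgram = nullptr;

private:
    std::pair<bool, bool> _loaded{ false, false };
};

// Mirrors opencl_is_available(): OpenCL is usable once a library was found
// and the first symbol resolved.
bool opencl_is_available()
{
    Symbols::get().load_default();
    return Symbols::get().clBuildProgram != nullptr;
}

int main()
{
    std::cout << std::boolalpha << opencl_is_available() << "\n";
    return 0;
}

(Linking needs -ldl; every exported cl* wrapper in the hunks above repeats the load_default() plus null-check pattern shown here.)
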
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index e3cbb6c..4424a66 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -23,14 +23,109 @@
*/
#include "helpers.h"
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+
+#define CONST_ONE (1 << FIXED_POINT_POSITION)
+#define ABS_OP(a) ABS_SAT_OP_EXPAND((a), DATA_TYPE, VEC_SIZE)
+#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
+#define SUB_OP(a, b) SUB_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
+#define MUL_OP(a, b) MUL_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define MLA_OP(a, b, c) MLA_SAT_OP_EXPAND((a), (b), (c), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define DIV_OP(a, b) DIV_SAT_OP_VEC_EXPAND((a), (b), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define EXP_OP(a) EXP_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define LOG_OP(a) LOG_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define SQRT_OP(a) DIV_OP(CONST_ONE, INVSQRT_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION))
+#define TANH_OP(a) TANH_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+
+#else /* FIXED_POINT_POSITION */
+
+#define CONST_ONE 1.f
+#define ABS_OP(a) fabs((a))
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define MLA_OP(a, b, c) ((b) * (c) + (a))
+#define DIV_OP(a, b) ((a) / (b))
+#define EXP_OP(a) exp((a))
+#define LOG_OP(a) log((a))
+#define SQRT_OP(a) sqrt((a))
+#define TANH_OP(a) tanh((a))
+
+#endif /* FIXED_POINT_POSITION */
+
+// Logistic Activation
+inline TYPE logistic_op(TYPE x)
+{
+ return DIV_OP((TYPE)CONST_ONE, ADD_OP((TYPE)CONST_ONE, EXP_OP(-x)));
+}
+// Hyperbolic Tangent Activation
+inline TYPE tanh_op(TYPE x)
+{
+ return MUL_OP((TYPE)A_VAL, TANH_OP(MUL_OP((TYPE)B_VAL, x)));
+}
+// RELU Activation
+inline TYPE relu_op(TYPE x)
+{
+ return max(0, x);
+}
+// Bounded RELU Activation
+inline TYPE brelu_op(TYPE x)
+{
+ return min((TYPE)A_VAL, max(0, x));
+}
+// Lower Upper Bounded RELU Activation
+inline TYPE lu_brelu_op(TYPE x)
+{
+ return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
+}
+// Leaky RELU Activation
+inline TYPE lrelu_op(TYPE x)
+{
+ return select(MUL_OP((TYPE)A_VAL, x), x, x > (TYPE)0);
+}
+// Soft RELU Activation
+inline TYPE srelu_op(TYPE x)
+{
+ return LOG_OP(ADD_OP((TYPE)CONST_ONE, EXP_OP(x)));
+}
+// Absolute Activation
+inline TYPE abs_op(TYPE x)
+{
+ return ABS_OP(x);
+}
+// Square Activation
+inline TYPE square_op(TYPE x)
+{
+ return MUL_OP(x, x);
+}
+// Square-root Activation
+inline TYPE sqrt_op(TYPE x)
+{
+ return SQRT_OP(x);
+}
+// Linear Activation
+inline TYPE linear_op(TYPE x)
+{
+ return MLA_OP((TYPE)B_VAL, (TYPE)A_VAL, x);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
/** This performs an activation function on floating point inputs.
*
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Activation function should be given as a preprocessor argument using -DNAME. e.g. -DTANH
- * @note Distinction between floating point and integer is done using -DTYPE_FP and -DTYPE_INT preprocessor argument
- * @note A, B variables required by some activation functions are set using -DA= and -DB= respectively.
+ * @note In order to perform the activation function "in-place", the preprocessor argument -DIN_PLACE must be passed at compile time
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note In case of fixed point calculations the fixed point position is passed using -DFIXED_POINT_POSITION=position. e.g. -DFIXED_POINT_POSITION=3.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -38,7 +133,7 @@
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -48,42 +143,28 @@
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
__kernel void activation_layer(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
{
// Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
// Load data
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)input.ptr);
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
// Perform activation
-#if defined LOGISTIC
- data = 1 / (1 + exp(-data));
-#elif defined TANH
- data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * tanh((VEC_DATA_TYPE(DATA_TYPE, 16))B * data);
-#elif defined RELU
- data = max(0, data);
-#elif defined BRELU
- data = min((VEC_DATA_TYPE(DATA_TYPE, 16))A, max(0, data));
-#elif defined SRELU
- data = log(1 + exp(data));
-#elif defined ABS
-#if defined TYPE_INT
- data = abs(data);
-#else
- data = fabs(data);
-#endif
-#elif defined SQUARE
- data = data * data;
-#elif defined SQRT
- data = sqrt(data);
-#elif defined LINEAR
- data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * data + (VEC_DATA_TYPE(DATA_TYPE, 16))B;
-#endif
+ data = ACTIVATION_OP(ACT, data);
// Store result
- vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)output.ptr);
}
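
The @note lines above list the compile-time configuration this kernel now expects. As a purely hypothetical host-side example (the concrete option values are chosen for illustration, not taken from this patch), a 16-wide F32 bounded-RELU build operating in place could pass the set below; note that ACTIVATION_OP pastes "_op" onto ACT, so ACT has to name one of the *_op functions defined above (brelu, relu, tanh, ...):

#include <iostream>
#include <set>
#include <string>

// Hypothetical build options for activation_layer (illustration only).
std::set<std::string> activation_build_options()
{
    return {
        "-DDATA_TYPE=float", // element type
        "-DVEC_SIZE=16",     // elements processed per work-item
        "-DACT=brelu",       // expands to brelu_op(...) via ACTIVATION_OP
        "-DA_VAL=6.0f",      // upper bound used by brelu_op
        "-DIN_PLACE"         // output tensor arguments are omitted
    };
}

int main()
{
    for(const auto &opt : activation_build_options())
    {
        std::cout << opt << " ";
    }
    std::cout << "\n";
    return 0;
}
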
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
index 434300e..0341410 100644
--- a/src/core/CL/cl_kernels/arithmetic_op.cl
+++ b/src/core/CL/cl_kernels/arithmetic_op.cl
@@ -23,13 +23,17 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif /* FIXED_POINT_POSITION */
+
#ifdef SATURATE
#define ADD(x, y) add_sat((x), (y))
#define SUB(x, y) sub_sat((x), (y))
-#else
+#else /* SATURATE */
#define ADD(x, y) (x) + (y)
#define SUB(x, y) (x) - (y)
-#endif
+#endif /* SATURATE */
/** This function adds two images.
*
@@ -37,19 +41,19 @@
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
* @attention To perform a saturating operation, -DSATURATE has to be passed to the compiler; otherwise the wrapping policy will be used.
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16/F16/F32
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8/QS8 (only if @p in1_ptr is QS8), QS16 (only if @p in1_ptr is QS16), S16/F16/F32
* @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8 (only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
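
The @attention note above selects between saturating and wrapping arithmetic at compile time. For illustration, a scalar sketch of the behaviour the saturating variant (add_sat) gives for signed 16-bit data; this is not library code:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Clamp the widened sum into the S16 range instead of letting it wrap.
int16_t add_sat_s16(int16_t a, int16_t b)
{
    const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
    return static_cast<int16_t>(std::min<int32_t>(INT16_MAX, std::max<int32_t>(INT16_MIN, sum)));
}

int main()
{
    std::printf("%d\n", add_sat_s16(30000, 10000)); // 32767 with -DSATURATE semantics
    return 0;
}
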
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 13e6702..b7423d8 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -23,9 +23,28 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+
+#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
+#define SUB_OP(a, b) SUB_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE)
+#define MUL_OP(a, b) MUL_SAT_OP_EXPAND((a), (b), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define INVSQRT_OP(a) INVSQRT_OP_EXPAND((a), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define SQCVT_SAT(a) SQCVT_SAT_OP_EXPAND((a), DATA_TYPE, FIXED_POINT_POSITION)
+
+#else /* FIXED_POINT_POSITION */
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#endif /* FIXED_POINT_POSITION */
+
/** Apply batch normalization.
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/QS16/F32
* @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
@@ -33,7 +52,7 @@
* @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
@@ -41,59 +60,72 @@
* @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: F32
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
* @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
* @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: F32
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
* @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
* @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: F32
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
* @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
* @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: F32
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
* @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
* @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
* @param[in] epsilon Epsilon parameter in the batch normalization equation
*/
__kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
VECTOR_DECLARATION(mean),
VECTOR_DECLARATION(var),
VECTOR_DECLARATION(beta),
VECTOR_DECLARATION(gamma),
float epsilon)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector var = CONVERT_TO_VECTOR_STRUCT(var);
- Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
- Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D out = in;
+#else /* IN_PLACE */
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
- float4 _in = 0;
- float4 denominator = 0;
- float4 numerator = 0;
- float4 x_bar = 0;
- float4 gamma_vec = 0;
- float4 beta_vec = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ _in = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = 0;
const int current_slice = get_global_id(2);
- _in = vload4(0, (__global float *)in.ptr);
- denominator = *((__global float *)(var.ptr + current_slice * var.stride_x));
- denominator = rsqrt(denominator + epsilon);
+ _in = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+ denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
+ denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(epsilon)));
// Calculate x bar and store results
- numerator = *((__global float *)(mean.ptr + current_slice * mean.stride_x));
- numerator = _in - numerator;
- x_bar = numerator * denominator;
+ numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
+ numerator = SUB_OP(_in, numerator);
+ x_bar = MUL_OP(numerator, denominator);
- gamma_vec = *((__global float *)(gamma.ptr + current_slice * beta.stride_x));
- beta_vec = *((__global float *)(beta.ptr + current_slice * beta.stride_x));
+ gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * beta.stride_x));
+ beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
- vstore4(gamma_vec * x_bar + beta_vec, 0, (__global float *)out.ptr);
+ VSTORE(VEC_SIZE)
+ (ADD_OP(MUL_OP(gamma_vec, x_bar), beta_vec), 0, (__global DATA_TYPE *)out.ptr);
}
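The arithmetic applied to every element is the standard batch normalization equation: out = gamma * (x - mean) / sqrt(var + epsilon) + beta, with one mean/var/beta/gamma value per Z slice. A minimal scalar C++ sketch of that equation, independent of the OpenCL macros above (the dense layout and names are illustrative only):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Reference batch normalization: one mean/var/beta/gamma value per channel.
    // 'in' is laid out as [channels][plane], where plane = height * width.
    std::vector<float> batchnorm_reference(const std::vector<float> &in,
                                           const std::vector<float> &mean,
                                           const std::vector<float> &var,
                                           const std::vector<float> &beta,
                                           const std::vector<float> &gamma,
                                           std::size_t channels, std::size_t plane,
                                           float epsilon)
    {
        std::vector<float> out(in.size());
        for(std::size_t c = 0; c < channels; ++c)
        {
            const float denominator = 1.0f / std::sqrt(var[c] + epsilon); // rsqrt(var + epsilon)
            for(std::size_t i = 0; i < plane; ++i)
            {
                const float x_bar = (in[c * plane + i] - mean[c]) * denominator;
                out[c * plane + i] = gamma[c] * x_bar + beta[c];
            }
        }
        return out;
    }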
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
index 93e80b9..d309812 100644
--- a/src/core/CL/cl_kernels/channel_combine.cl
+++ b/src/core/CL/cl_kernels/channel_combine.cl
@@ -337,11 +337,11 @@
uchar8 data1 = vload8(0, src_plane1.ptr);
uchar8 data2 = vload8(0, src_plane2.ptr);
-#if defined NV12
+#ifdef NV12
vstore16(shuffle2(data1, data2, (uchar16)(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15)), 0, dst_plane1.ptr);
-#elif defined NV21
+#elif defined(NV21)
vstore16(shuffle2(data2, data1, (uchar16)(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15)), 0, dst_plane1.ptr);
-#endif
+#endif /* NV12 or NV21 */
}
/** This function combines three planes to a single YUV444 or IYUV image.
@@ -405,12 +405,12 @@
// Copy plane data
vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
-#if defined YUV444
+#ifdef YUV444
vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr);
vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr);
-#elif defined IYUV
+#elif defined(IYUV)
vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr);
vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr);
-#endif
+#endif /* YUV444 or IYUV */
}
diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl
index 14c6c8a9..e95bda4 100644
--- a/src/core/CL/cl_kernels/channel_extract.cl
+++ b/src/core/CL/cl_kernels/channel_extract.cl
@@ -51,16 +51,16 @@
uchar16 data = vload16(0, src.ptr);
uchar8 data2 = vload8(0, src.ptr + 16);
-#if defined CHANNEL_R
+#ifdef CHANNEL_R
vstore4(data.s0369, 0, dst.ptr);
vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4);
-#elif defined CHANNEL_G
+#elif defined(CHANNEL_G)
vstore4(data.s147A, 0, dst.ptr);
vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4);
-#elif defined CHANNEL_B
+#elif defined(CHANNEL_B)
vstore4(data.s258B, 0, dst.ptr);
vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4);
-#endif
+#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B */
}
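Packed RGB888 keeps the three channels of each pixel in consecutive bytes, so channel extraction is a strided gather: output[i] = input[3 * i + c] with c = 0, 1 or 2 for R, G, B (and a stride of 4 for the RGBA kernel below). A scalar C++ sketch of that access pattern on dense buffers (names are illustrative):

    #include <cstddef>
    #include <cstdint>

    // Extract one channel from a packed pixel buffer.
    // bytes_per_pixel is 3 for RGB888 and 4 for RGBA8888; channel is the byte
    // offset of the requested channel inside each pixel.
    void extract_channel(const uint8_t *src, uint8_t *dst, std::size_t num_pixels,
                         std::size_t bytes_per_pixel, std::size_t channel)
    {
        for(std::size_t i = 0; i < num_pixels; ++i)
        {
            dst[i] = src[i * bytes_per_pixel + channel];
        }
    }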
/** This function extracts a given channel from an RGBA image.
@@ -91,15 +91,15 @@
uchar16 data = vload16(0, src.ptr);
uchar16 data2 = vload16(0, src.ptr + 16);
-#if defined CHANNEL_R
+#ifdef CHANNEL_R
vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr);
-#elif defined CHANNEL_G
+#elif defined(CHANNEL_G)
vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr);
-#elif defined CHANNEL_B
+#elif defined(CHANNEL_B)
vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr);
-#elif defined CHANNEL_A
+#elif defined(CHANNEL_A)
vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr);
-#endif
+#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B or CHANNEL_A */
}
/** This function extracts a given channel from an YUYV image.
@@ -129,13 +129,13 @@
uchar16 data = vload16(0, src.ptr);
-#if defined CHANNEL_Y
+#ifdef CHANNEL_Y
vstore8(data.s02468ACE, 0, dst.ptr);
-#elif defined CHANNEL_U
+#elif defined(CHANNEL_U)
vstore4(data.s159D, 0, dst.ptr);
-#elif defined CHANNEL_V
+#elif defined(CHANNEL_V)
vstore4(data.s37BF, 0, dst.ptr);
-#endif
+#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
}
/** This function extracts a given channel from an UYUV image.
@@ -165,13 +165,13 @@
uchar16 data = vload16(0, src.ptr);
-#if defined CHANNEL_Y
+#ifdef CHANNEL_Y
vstore8(data.s13579BDF, 0, dst.ptr);
-#elif defined CHANNEL_U
+#elif defined(CHANNEL_U)
vstore4(data.s048C, 0, dst.ptr);
-#elif defined CHANNEL_V
+#elif defined(CHANNEL_V)
vstore4(data.s26AE, 0, dst.ptr);
-#endif
+#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */
}
/** This function extracts a given channel from an NV12 image.
@@ -202,11 +202,11 @@
uchar16 data = vload16(0, src.ptr);
-#if defined CHANNEL_U
+#ifdef CHANNEL_U
vstore8(data.s02468ACE, 0, dst.ptr);
-#elif defined CHANNEL_V
+#elif defined(CHANNEL_V)
vstore8(data.s13579BDF, 0, dst.ptr);
-#endif
+#endif /* CHANNEL_U or CHANNEL_V */
}
/** This function extracts a given channel from an NV21 image.
@@ -237,11 +237,11 @@
uchar16 data = vload16(0, src.ptr);
-#if defined CHANNEL_U
+#ifdef CHANNEL_U
vstore8(data.s13579BDF, 0, dst.ptr);
-#elif defined CHANNEL_V
+#elif defined(CHANNEL_V)
vstore8(data.s02468ACE, 0, dst.ptr);
-#endif
+#endif /* CHANNEL_U or CHANNEL_V */
}
/** This function extracts a given plane from an multi-planar image.
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 00f5189..a92ab5b 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -25,29 +25,35 @@
/** This kernel concatenates the input tensor into the output tensor along the third dimension
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8, QS16, F16, F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] offset The offset to the first valid element of the output tensor in bytes
+ * @param[in] offsets The offsets to the first valid element of the output tensor in bytes
*/
__kernel void concatenate_depth(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- unsigned int offset)
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ int3 offsets)
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- float4 source_values = vload4(0, (__global float *)src.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, -offsets.x, -offsets.y, 0));
- vstore4(source_values, 0, (__global float *)(dst.ptr + offset));
+ VSTORE(VEC_SIZE)
+ (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z));
}
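Conceptually, depth concatenation copies each input tensor into the output at a running depth offset (carried here as the byte offset offsets.z), while offsets.x and offsets.y compensate for padding differences between input and output. A minimal host-side C++ sketch of the element mapping, ignoring padding (dense buffers and names are illustrative):

    #include <cstddef>
    #include <vector>

    // Concatenate equally sized W x H planes of several inputs along the depth
    // axis. Input k has depth[k] planes of 'plane' elements each; 'output' must
    // be pre-sized to sum(depth) * plane elements.
    void concatenate_depth_reference(const std::vector<std::vector<float>> &inputs,
                                     const std::vector<std::size_t> &depth,
                                     std::size_t plane, std::vector<float> &output)
    {
        std::size_t z_base = 0;
        for(std::size_t k = 0; k < inputs.size(); ++k)
        {
            for(std::size_t z = 0; z < depth[k]; ++z)
            {
                for(std::size_t i = 0; i < plane; ++i)
                {
                    output[(z_base + z) * plane + i] = inputs[k][z * plane + i];
                }
            }
            z_base += depth[k];
        }
    }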
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
index 3733d0c..8c75ecd 100644
--- a/src/core/CL/cl_kernels/convolution3x3.cl
+++ b/src/core/CL/cl_kernels/convolution3x3.cl
@@ -25,11 +25,11 @@
#ifndef DATA_TYPE
#define DATA_TYPE short
-#endif
+#endif /* DATA_TYPE */
#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
-#endif
+#endif /* DATA_TYPE_OUT */
/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
*
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
index d1335c5..605cd09 100644
--- a/src/core/CL/cl_kernels/convolution5x5.cl
+++ b/src/core/CL/cl_kernels/convolution5x5.cl
@@ -25,15 +25,15 @@
#ifndef DATA_TYPE
#define DATA_TYPE short
-#endif
+#endif /* DATA_TYPE */
#ifndef COMPUTE_TYPE
#define COMPUTE_TYPE int
-#endif
+#endif /* COMPUTE_TYPE */
#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
-#endif
+#endif /* DATA_TYPE_OUT */
/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
*
diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
index 74a0055..1abfb15 100644
--- a/src/core/CL/cl_kernels/convolution7x7.cl
+++ b/src/core/CL/cl_kernels/convolution7x7.cl
@@ -25,15 +25,15 @@
#ifndef DATA_TYPE
#define DATA_TYPE short
-#endif
+#endif /* DATA_TYPE */
#ifndef COMPUTE_TYPE
#define COMPUTE_TYPE int
-#endif
+#endif /* COMPUTE_TYPE */
#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
-#endif
+#endif /* DATA_TYPE_OUT */
/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
*
diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
index d8b07ca..f537326 100644
--- a/src/core/CL/cl_kernels/convolution9x9.cl
+++ b/src/core/CL/cl_kernels/convolution9x9.cl
@@ -25,15 +25,15 @@
#ifndef DATA_TYPE
#define DATA_TYPE short
-#endif
+#endif /* DATA_TYPE */
#ifndef COMPUTE_TYPE
#define COMPUTE_TYPE int
-#endif
+#endif /* COMPUTE_TYPE */
#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
-#endif
+#endif /* DATA_TYPE_OUT */
/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
*
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index bd5dfaf..9e9d0b0 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -23,11 +23,15 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif // FIXED_POINT_POSITION
+
/** This kernel reshapes the tensor's low three dimensions to single column
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -35,13 +39,13 @@
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as input
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] bias_ptr Pointer to the bias tensor. Same as input
+ * @param[in] bias_ptr Pointer to the bias tensor. Same as @p src_ptr
* @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
* @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] bias_offset_first_element_in_bytes The offset of the first element in the source tensor
@@ -53,9 +57,9 @@
__kernel void reshape_to_columns(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
-#if defined HAS_BIAS
+#ifdef HAS_BIAS
VECTOR_DECLARATION(bias),
-#endif
+#endif /* HAS_BIAS */
uint width, uint height, uint depth, uint total_filters)
{
Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
@@ -64,9 +68,9 @@
__global uchar *tmp_src_ptr = src.ptr;
__global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(
2) * width * height * dst_stride_y;
-#if defined HAS_BIAS
+#ifdef HAS_BIAS
__global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
-#endif
+#endif /* HAS_BIAS */
if(is_last_thread)
{
@@ -74,10 +78,10 @@
{
*((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
-#if defined HAS_BIAS
+#ifdef HAS_BIAS
*((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
tmp_bias_ptr += bias_stride_x;
-#endif
+#endif /* HAS_BIAS */
tmp_src_ptr += depth * src_stride_z;
tmp_dst_ptr += dst_stride_x;
}
@@ -93,12 +97,13 @@
}
}
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -106,75 +111,156 @@
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] kernel_size The convolution kernel size
- * @param[in] kernel_depth The kernel depth
- * @param[in] width The output tensor width
- * @param[in] input_dims The input tensor dimensions
- * @param[in] strides The strides of the im2col operation
- * @param[in] paddings The input tensor paddings
+ * @param[in] filter_depth The depth of the used filter
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
__kernel void im2col_generic(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
- int kernel_size,
- int kernel_depth,
- int width,
- int2 input_dims,
- int2 strides,
- int2 paddings)
+ uint filter_depth,
+ uint src_stride_w,
+ uint dst_stride_w)
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT_NO_STEP(dst);
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % filter_depth; // input feature map
+ const int batch = get_global_id(2) / filter_depth; // the batch
- // Determine output index
- uint idx = (get_global_id(1) * width + get_global_id(0)) * dst.stride_y;
- __global uchar *output_ptr = dst.ptr + idx;
+ // Calculate input indices
+ const int xi = xc * STRIDE_X - PAD_X;
+ const int yi = yc * STRIDE_Y - PAD_Y;
- // Determine current input index
- const int top_left_x = get_global_id(0) * strides.x - paddings.x;
- const int top_left_y = get_global_id(1) * strides.y - paddings.y;
+ // Calculate output indices
+ const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
// Linearize convolution elements
- for(int d = 0; d < kernel_depth; ++d)
+ for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
{
- for(int y = top_left_y, y_e = top_left_y + kernel_size; y < y_e; ++y)
+ for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x, ++output_ptr)
{
- for(int x = top_left_x, x_e = top_left_x + kernel_size; x < x_e; ++x, output_ptr += dst.stride_x)
+#if PAD_X == 0 && PAD_Y == 0
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+#else // PAD_X == 0 && PAD_Y == 0
+ if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
{
- if(x < 0 || x >= input_dims.x || y < 0 || y >= input_dims.y)
- {
- *((__global DATA_TYPE *)output_ptr) = 0;
- }
- else
- {
- *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)(tensor3D_offset(&src, x, y, d)));
- }
+ *output_ptr = 0;
}
+ else
+ {
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+ }
+#endif // PAD_X == 0 && PAD_Y == 0
}
}
-#if defined HAS_BIAS
- *((__global DATA_TYPE *)output_ptr) = 1;
-#endif
+#ifdef HAS_BIAS
+ if(ch == (KERNEL_DEPTH - 1))
+ {
+#ifdef FIXED_POINT_POSITION
+ *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else // FIXED_POINT_POSITION
+ *output_ptr = 1.0f;
+#endif // FIXED_POINT_POSITION
+ }
+#endif // HAS_BIAS
}
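For reference, im2col linearises each KERNEL_WIDTH x KERNEL_HEIGHT x KERNEL_DEPTH receptive field into one row of the output matrix and writes zeros wherever the window falls into the padding region. A single-batch C++ sketch of the same mapping on dense row-major buffers (names are illustrative; the kernel above additionally appends a 1 per row when -DHAS_BIAS is set):

    #include <cstddef>
    #include <vector>

    // Single-batch im2col for a dense [depth][height][width] input.
    // Each output row holds one receptive field of kw * kh * depth elements.
    std::vector<float> im2col_reference(const std::vector<float> &src,
                                        int width, int height, int depth,
                                        int kw, int kh, int stride_x, int stride_y,
                                        int pad_x, int pad_y)
    {
        const int out_w    = (width + 2 * pad_x - kw) / stride_x + 1;
        const int out_h    = (height + 2 * pad_y - kh) / stride_y + 1;
        const int row_size = kw * kh * depth;
        std::vector<float> dst(static_cast<std::size_t>(out_w) * out_h * row_size, 0.0f);

        for(int yc = 0; yc < out_h; ++yc)
        {
            for(int xc = 0; xc < out_w; ++xc)
            {
                const std::size_t row = static_cast<std::size_t>(yc) * out_w + xc;
                std::size_t col = 0;
                for(int ch = 0; ch < depth; ++ch)
                {
                    for(int y = yc * stride_y - pad_y; y < yc * stride_y - pad_y + kh; ++y)
                    {
                        for(int x = xc * stride_x - pad_x; x < xc * stride_x - pad_x + kw; ++x, ++col)
                        {
                            if(x >= 0 && x < width && y >= 0 && y < height)
                            {
                                dst[row * row_size + col] = src[(ch * height + y) * width + x];
                            }
                        }
                    }
                }
            }
        }
        return dst;
    }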
-/** This kernel performs a reshaping of the output of the convolution layer.
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3 and pad_x = pad_y = 0
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] filter_depth The depth of the used filter
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_kernel3x3_padx0_pady0(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint filter_depth,
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % filter_depth; // input feature map
+ const int batch = get_global_id(2) / filter_depth; // the batch
+
+ // Calculate input indices
+ const int xi = xc * STRIDE_X;
+ const int yi = yc * STRIDE_Y;
+
+ // Calculate output indices
+ const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+ const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
+
+ __global DATA_TYPE *output_ptr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w) + xo;
+
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+
+ vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, output_ptr);
+ *(output_ptr + 8) = row2.s2;
+
+#ifdef HAS_BIAS
+ if(ch == (KERNEL_DEPTH - 1))
+ {
+#ifdef FIXED_POINT_POSITION
+ *(output_ptr + 9) = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else // FIXED_POINT_POSITION
+ *(output_ptr + 9) = 1.0f;
+#endif // FIXED_POINT_POSITION
+ }
+#endif // HAS_BIAS
+}
+#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+
+#if defined(WIDTH_OUTPUT)
+/** This kernel performs a reshaping of the output of the convolution layer.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
@@ -182,27 +268,30 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width The output tensor width
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
*/
__kernel void col2im(
- IMAGE_DECLARATION(src),
+ TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
- uint width)
+ uint dst_stride_w)
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
- int idx = get_global_id(0) * dst.stride_z + (get_global_id(1) / width) * dst.stride_y + (get_global_id(1) % width) * dst.stride_x;
- __global uchar *tmp_out_ptr = dst.ptr + idx;
- *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)(src.ptr));
+ // Compute output offset
+ int idx = get_global_id(0) * dst.stride_z + (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w;
+
+ // Store value
+ *((__global DATA_TYPE *)(dst.ptr + idx)) = *((__global DATA_TYPE *)(src.ptr));
}
+#endif // defined(WIDTH_OUTPUT)
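col2im is the inverse reshaping: one coordinate of the GEMM result runs over output feature maps and the other over spatial positions n, and each element lands at (x, y, z) = (n % WIDTH_OUTPUT, n / WIDTH_OUTPUT, feature map) in the output tensor. A C++ sketch of that index mapping for one batch (dense buffers and names are illustrative; on dense buffers it degenerates to a plain copy, the mapping only matters once the destination carries padding and strides):

    #include <cstddef>
    #include <vector>

    // Reshape a [num_maps][out_h * out_w] GEMM result into a
    // [num_maps][out_h][out_w] tensor; 'dst' must be pre-sized accordingly.
    void col2im_reference(const std::vector<float> &src, std::vector<float> &dst,
                          std::size_t num_maps, std::size_t out_w, std::size_t out_h)
    {
        for(std::size_t m = 0; m < num_maps; ++m)
        {
            for(std::size_t n = 0; n < out_w * out_h; ++n)
            {
                const std::size_t x = n % out_w;
                const std::size_t y = n / out_w;
                dst[(m * out_h + y) * out_w + x] = src[m * out_w * out_h + n];
            }
        }
    }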
/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
* @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -210,7 +299,7 @@
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as input.
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
@@ -230,12 +319,16 @@
*((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);
-#if defined HAS_BIAS
+#ifdef HAS_BIAS
// If it is the last thread in the 3 dimensional workgroup
if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
{
tmp_out_ptr += dst_stride_x;
+#ifdef FIXED_POINT_POSITION
+ *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else // FIXED_POINT_POSITION
*((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1;
+#endif // FIXED_POINT_POSITION
}
-#endif
+#endif // HAS_BIAS
}
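Note that the fixed-point branch appends the fixed-point representation of 1 rather than the raw integer 1: a QS value stores a real number scaled by 2^FIXED_POINT_POSITION, so 1.0 becomes 1 << FIXED_POINT_POSITION. A small illustrative C++ sketch of that encoding (no saturation or rounding handling; the helper names are not part of the library):

    #include <cstdint>

    // Illustrative QS8 encode/decode for a given fixed point position.
    inline int8_t qs8_from_float(float x, int position)
    {
        return static_cast<int8_t>(x * static_cast<float>(1 << position));
    }

    inline float qs8_to_float(int8_t x, int position)
    {
        return static_cast<float>(x) / static_cast<float>(1 << position);
    }

    // Example: with position = 5, qs8_from_float(1.0f, 5) == 32, i.e. 1 << 5,
    // which is exactly the value appended for the bias column above.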
diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl
index 96b9cff..f5a109f 100644
--- a/src/core/CL/cl_kernels/convolution_rectangle.cl
+++ b/src/core/CL/cl_kernels/convolution_rectangle.cl
@@ -31,15 +31,15 @@
#ifndef DATA_TYPE
#define DATA_TYPE short
-#endif
+#endif /* DATA_TYPE */
#ifndef COMPUTE_TYPE
#define COMPUTE_TYPE int
-#endif
+#endif /* COMPUTE_TYPE */
#ifndef DATA_TYPE_OUT
#define DATA_TYPE_OUT uchar
-#endif
+#endif /* DATA_TYPE_OUT */
#ifndef DYNAMIC_MATRIX_CONVOLUTION
@@ -89,24 +89,24 @@
#if MATRIX_WIDTH == 3
pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3],
matrix_coeff[2 + i * 3]);
-#endif
+#endif /* MATRIX_WIDTH */
#if MATRIX_WIDTH == 5
pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5],
matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]);
-#endif
+#endif /* MATRIX_WIDTH */
#if MATRIX_WIDTH == 7
pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7],
matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7],
matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]);
-#endif
+#endif /* MATRIX_WIDTH */
#if MATRIX_WIDTH == 9
pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9],
matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9],
matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]);
-#endif
+#endif /* MATRIX_WIDTH */
}
pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE;
@@ -115,4 +115,4 @@
vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr));
}
-#endif // DYNAMIC_MATRIX_CONVOLUTION
+#endif /* not DYNAMIC_MATRIX_CONVOLUTION */
diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl
index c8eaa95..a9b7284 100644
--- a/src/core/CL/cl_kernels/depth_convert.cl
+++ b/src/core/CL/cl_kernels/depth_convert.cl
@@ -23,24 +23,47 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+
+#include "fixed_point.h"
+
+#ifdef SATURATE
+#define CONVERT_DOWN(x, in_type, out_type, fixed_point_position) CONVERT_DOWN1_SAT(x, in_type, out_type, fixed_point_position)
+#define CONVERT_DOWN1_SAT(x, in_type, out_type, fixed_point_position) convert_##out_type##_##in_type##_sat(x, fixed_point_position)
+#else /* SATURATE */
+#define CONVERT_DOWN(x, in_type, out_type, fixed_point_position) CONVERT_DOWN1(x, in_type, out_type, fixed_point_position)
+#define CONVERT_DOWN1(x, in_type, out_type, fixed_point_position) convert_##out_type##_##in_type(x, fixed_point_position)
+#endif /* SATURATE */
+
+#define CONVERT_UP(x, in_type, out_type, fixed_point_position) CONVERT_UP1(x, in_type, out_type, fixed_point_position)
+#define CONVERT_UP1(x, in_type, out_type, fixed_point_position) convert_##out_type##_##in_type(x, fixed_point_position)
+
+#else /* FIXED_POINT_POSITION */
+
#ifdef SATURATE
#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type)
-#else
+#else /* SATURATE */
#define CONVERT_DOWN(x, type) CONVERT(x, type)
-#endif
+#endif /* SATURATE */
+
+#define CONVERT_UP(x, type) CONVERT(x, type)
+
+#endif /* FIXED_POINT_POSITION */
/** This function performs a down-scaling depth conversion.
*
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
*
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F16, F32
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: QS8, U8, QS16, U16, S16, U32, S32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -60,7 +83,12 @@
// Load data
VEC_DATA_TYPE(DATA_TYPE_IN, 16)
in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+
+#if defined(FIXED_POINT_POSITION)
+ vstore16(CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_IN, 16), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), FIXED_POINT_POSITION), 0, (__global DATA_TYPE_OUT *)out.ptr);
+#else /* FIXED_POINT_POSITION */
vstore16(CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+#endif /* FIXED_POINT_POSITION */
}
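In the non-fixed-point path the down conversion is an arithmetic right shift followed by a cast to the narrower type, saturating when -DSATURATE is defined. A scalar C++ sketch of that behaviour for the S16 to U8 case (illustrative only, not the library implementation):

    #include <algorithm>
    #include <cstdint>

    // Down-scaling depth conversion, S16 -> U8, with an optional saturating cast.
    inline uint8_t depth_convert_down_s16_to_u8(int16_t in, int shift, bool saturate)
    {
        const int16_t shifted = static_cast<int16_t>(in >> shift);
        if(saturate)
        {
            // CONVERT_SAT clamps to the destination range, here [0, 255].
            return static_cast<uint8_t>(std::min<int16_t>(std::max<int16_t>(shifted, 0), 255));
        }
        return static_cast<uint8_t>(shifted); // plain truncating cast, like CONVERT()
    }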
/** This function performs a up-scaling depth conversion.
@@ -68,13 +96,15 @@
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
*
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, QS8, U16, S16, QS16, U32 or S32
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32, S32, F16 or F32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -92,7 +122,12 @@
Image out = CONVERT_TO_IMAGE_STRUCT(out);
// Load data
- VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- in_data = CONVERT(vload16(0, (__global DATA_TYPE_IN *)in.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
- vstore16(in_data << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
+ VEC_DATA_TYPE(DATA_TYPE_IN, 16)
+ in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+
+#if defined(FIXED_POINT_POSITION)
+ vstore16(CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_IN, 16), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), FIXED_POINT_POSITION), 0, (__global DATA_TYPE_OUT *)out.ptr);
+#else /* FIXED_POINT_POSITION */
+ vstore16(CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)) << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
+#endif /* FIXED_POINT_POSITION */
}
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
new file mode 100644
index 0000000..9c2c3a5
--- /dev/null
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(CONV_STRIDE_X)
+
+#if CONV_STRIDE_X == 1
+#define convolution1x3 convolution1x3_stride_1
+#elif CONV_STRIDE_X == 2
+#define convolution1x3 convolution1x3_stride_2
+#elif CONV_STRIDE_X == 3
+#define convolution1x3 convolution1x3_stride_3
+#else /* CONV_STRIDE_X */
+#error "Stride not supported"
+#endif /* CONV_STRIDE_X */
+
+/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
+ *
+ * @param[in] left_pixel Pointer to the left pixel.
+ * @param[in] left_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff Weight of the right pixel
+ *
+ * @return a float2 containing 2 convolved values.
+ */
+inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
+ const float left_coeff,
+ const float middle_coeff,
+ const float right_coeff)
+{
+ float4 temp = vload4(0, (__global float *)left_pixel);
+
+ float2 left = CONVERT(temp.s01, float2);
+ float2 middle = CONVERT(temp.s12, float2);
+ float2 right = CONVERT(temp.s23, float2);
+
+ return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
+ *
+ * @param[in] left_pixel Pointer to the left pixel.
+ * @param[in] left_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff Weight of the right pixel
+ *
+ * @return a float2 containing 2 convolved values.
+ */
+inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
+ const float left_coeff,
+ const float middle_coeff,
+ const float right_coeff)
+{
+ float4 temp0 = vload4(0, (__global float *)left_pixel);
+ float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
+
+ float2 left = CONVERT(temp0.s02, float2);
+ float2 middle = CONVERT(temp0.s13, float2);
+ float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
+
+ return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
+ *
+ * @param[in] left_pixel Pointer to the left pixel.
+ * @param[in] left_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff Weight of the right pixel
+ *
+ * @return a float2 containing 2 convolved values.
+ */
+inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
+ const float left_coeff,
+ const float middle_coeff,
+ const float right_coeff)
+{
+ float4 temp0 = vload4(0, (__global float *)left_pixel);
+ float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
+
+ float2 left = CONVERT(temp0.s03, float2);
+ float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
+ float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
+
+ return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
+}
+
+/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src A pointer to source Image structure
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ *
+ * @return a float2 containing 2 convolved values.
+ */
+inline float2 convolution3x3(
+ Image *src,
+ const float mat0, const float mat1, const float mat2,
+ const float mat3, const float mat4, const float mat5,
+ const float mat6, const float mat7, const float mat8)
+{
+ float2 pixels;
+
+ pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
+ pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
+ pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
+
+ return pixels;
+}
+
+/** This function computes a 3x3 depthwise convolution.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ */
+
+__kernel void depthwise_convolution_3x3(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), TENSOR3D_DECLARATION(weights))
+{
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+
+ uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
+ float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
+ float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
+ float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));
+
+ float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
+ weights_values1.s0, weights_values1.s1, weights_values1.s2,
+ weights_values2.s0, weights_values2.s1, weights_values2.s2);
+
+ vstore2(pixels, 0, (__global float *)dst.ptr);
+}
+
+#endif //defined(CONV_STRIDE_X)
+
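For reference, a depthwise convolution applies one 3x3 filter per input channel and produces one output channel per input channel; unlike a regular convolution there is no accumulation across channels. A minimal scalar C++ sketch for a single channel over the valid (non-padded) region (dense buffers and names are illustrative):

    #include <cstddef>
    #include <vector>

    // Depthwise 3x3 convolution for one channel, valid region only.
    // in: [h][w], weights: [3][3], out must be pre-sized to
    // ((h - 3) / stride_y + 1) * ((w - 3) / stride_x + 1) elements.
    void depthwise_conv3x3_channel(const std::vector<float> &in, int w, int h,
                                   const float weights[9], int stride_x, int stride_y,
                                   std::vector<float> &out)
    {
        const int out_w = (w - 3) / stride_x + 1;
        const int out_h = (h - 3) / stride_y + 1;
        for(int oy = 0; oy < out_h; ++oy)
        {
            for(int ox = 0; ox < out_w; ++ox)
            {
                float acc = 0.0f;
                for(int ky = 0; ky < 3; ++ky)
                {
                    for(int kx = 0; kx < 3; ++kx)
                    {
                        acc += in[(oy * stride_y + ky) * w + (ox * stride_x + kx)] * weights[ky * 3 + kx];
                    }
                }
                out[oy * out_w + ox] = acc;
            }
        }
    }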
+#if defined(SRC_WIDTH) && defined(DATA_TYPE)
+/** This kernel reshapes each of the tensor's low three dimensions to single rows.
+ *
+ * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depthwise_weights_reshape(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
+
+ for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)
+ {
+ *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
+ }
+}
+#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
+
+#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_X, -DPAD_Y, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+
+__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ const int src_pixel_linear = get_global_id(1) * STRIDE_X;
+ const int full_length = SRC_WIDTH + 2 * PAD_X;
+ const int max_initial_x = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
+
+ const int src_x = -PAD_X + src_pixel_linear % max_initial_x;
+ const int src_y = -PAD_Y + src_pixel_linear / max_initial_x * STRIDE_Y;
+ const int src_z = get_global_id(2);
+
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
+
+ for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
+ {
+ for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
+ {
+ if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
+ {
+ *output_ptr = 0;
+ }
+ else
+ {
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+ }
+ }
+ }
+}
+
+#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+
+#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
+
+/** This kernel performs a reshaping of the output of the depthwise generic convolution.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depthwise_vector_to_tensor(
+ VECTOR_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Vector src = CONVERT_TO_VECTOR_STRUCT(src);
+
+ const int patch_size = CONV_WIDTH * CONV_HEIGHT;
+ const int id0 = get_global_id(0);
+ const int z = id0 / patch_size;
+ const int index2D = id0 - z * patch_size;
+
+ __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z;
+ *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
+}
+
+#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
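The mapping from the linear vector index back to tensor coordinates follows directly from the patch size CONV_WIDTH * CONV_HEIGHT. A small C++ sketch of the same index arithmetic (names are illustrative):

    #include <cstddef>

    struct Coord3D
    {
        std::size_t x, y, z;
    };

    // Map a linear index of the intermediate vector to (x, y, z) coordinates of
    // the output tensor, given the size of one convolved plane.
    inline Coord3D vector_to_tensor_coord(std::size_t i, std::size_t conv_width, std::size_t conv_height)
    {
        const std::size_t patch_size = conv_width * conv_height;
        Coord3D c;
        c.z = i / patch_size;
        const std::size_t rem = i - c.z * patch_size;
        c.y = rem / conv_width;
        c.x = rem % conv_width;
        return c;
    }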
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
new file mode 100644
index 0000000..21e9c87
--- /dev/null
+++ b/src/core/CL/cl_kernels/dequantization_layer.cl
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel performs the dequantization of 8-bit unsigned integers to floating point.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] min_max_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
+ * @param[in] min_max_stride_x Stride of the min/max vector in X dimension (in bytes)
+ * @param[in] min_max_step_x min_max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] min_max_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ */
+__kernel void dequantization_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ VECTOR_DECLARATION(min_max))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Vector min_max = CONVERT_TO_VECTOR_STRUCT(min_max);
+
+ // min_max_value.s0 = min, min_max_value.s1 = max
+ const float2 min_max_value = vload2(0, (__global float *)min_max.ptr);
+
+ const float4 vmin = (float4)min_max_value.s0;
+ const float4 scale = (float4)((min_max_value.s1 - min_max_value.s0) / 255.0f);
+
+ // Load data
+ const uchar4 data = vload4(0, (__global uchar *)input.ptr);
+
+ // Dequantize
+ const float4 res = convert_float4(data) * scale + vmin;
+
+ // Store result
+ vstore4(res, 0, (__global float *)output.ptr);
+}
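+
+/* Reference only: a scalar sketch of the dequantization arithmetic used above,
+ * assuming an 8-bit input quantized over [min, max]. The helper below is purely
+ * illustrative (its name is not part of the library) and is never called by the
+ * kernel.
+ */
+inline float dequantize_scalar(uchar q, float min_val, float max_val)
+{
+    const float scale = (max_val - min_val) / 255.0f; // size of one quantization step
+    return (float)q * scale + min_val;                // same value as "res" in the kernel
+}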
diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl
index 0e810d2..cd2091e 100644
--- a/src/core/CL/cl_kernels/derivative.cl
+++ b/src/core/CL/cl_kernels/derivative.cl
@@ -52,29 +52,29 @@
#ifdef GRAD_X
,
IMAGE_DECLARATION(dst_gx)
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
,
IMAGE_DECLARATION(dst_gy)
-#endif
+#endif /* GRAD_Y */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
#ifdef GRAD_X
short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0)));
short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0)));
vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1)));
short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1)));
vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl
new file mode 100644
index 0000000..fb516dd
--- /dev/null
+++ b/src/core/CL/cl_kernels/direct_convolution1x1.cl
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+
+#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE_PROMOTED, 8)
+#define MUL_OP(a, b) MUL_SAT_OP_EXPAND(CONVERT((a), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), CONVERT((b), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), DATA_TYPE_PROMOTED, 8, FIXED_POINT_POSITION)
+
+// There is no need to have a larger intermediate type for qs32 because all the arguments are already promoted
+MULQ_SAT_IMPL(qs32x8, qs32x8)
+
+#else /* FIXED_POINT_POSITION */
+#undef CONVERT_SAT
+
+#define ADD_OP(a, b) ((a) + (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define CONVERT_SAT(a, b) ((a))
+
+#endif /* FIXED_POINT_POSITION */
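+
+/* Illustrative note on the macros above: in fixed point builds ADD_OP and MUL_OP
+ * saturate and operate on the promoted type (e.g. qs16 accumulators for qs8
+ * inputs), while in floating point builds they reduce to plain + and * and
+ * CONVERT_SAT becomes a no-op, so the final vstore writes the accumulator
+ * unchanged.
+ */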
+
+#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if STRIDE_X == 3
+#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
+#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
+#elif STRIDE_X == 2
+#define INPUT_PIXEL(data_size) extract_input_stride2
+#elif STRIDE_X == 1
+#define INPUT_PIXEL(data_size) extract_input_stride1
+#else /* STRIDE_X not equal to 1, 2 or 3 */
+#error "Only strides 1, 2 and 3 are supported"
+#endif /* STRIDE_X == 3 */
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 1.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
+{
+ return vload8(0, input_pixel);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 2.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ temp = vload16(0, input_pixel);
+ return temp.s02468ace;
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 32-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ temp1 = vload4(0, input_pixel);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ temp2 = vload4(0, input_pixel + 6);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ temp3 = vload4(0, input_pixel + 12);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ temp4 = vload4(0, input_pixel + 18);
+ return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 16-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp1 = vload8(0, input_pixel);
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp2 = vload8(0, input_pixel + 8);
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp3 = vload8(0, input_pixel + 16);
+ return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 8-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ temp1 = vload16(0, input_pixel);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ temp2 = vload16(0, input_pixel + 12);
+ return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
+}
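+
+/* Illustrative example of how the INPUT_PIXEL macro above resolves: with
+ * -DSTRIDE_X=3 and -DDATA_SIZE=32, INPUT_PIXEL(DATA_SIZE) expands to
+ * extract_input_stride3_32, while with -DSTRIDE_X=1 it expands to
+ * extract_input_stride1 regardless of DATA_SIZE.
+ */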
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution1x1(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* defined(HAS_BIAS) */
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+ pixels = 0;
+
+ const uint z_index = get_global_id(2);
+
+ weights.ptr += z_index * weights_stride_w;
+
+ for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
+ DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
+ pixels = ADD_OP(pixels, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
+ src.ptr += src_stride_z;
+ weights.ptr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ pixels = ADD_OP(pixels, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
+#endif /* defined(HAS_BIAS) */
+
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
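+
+/* Reference only: a scalar sketch of what one output element of the 1x1 direct
+ * convolution computes for unit strides, using a flat [channel][y][x] layout.
+ * The helper name and layout are assumptions for illustration; this is not the
+ * library's implementation and is never called by the kernels.
+ */
+inline float direct_conv1x1_ref(const float *src, const float *weights, float bias,
+                                int x, int y, int width, int height, int depth)
+{
+    float acc = bias;
+    for(int c = 0; c < depth; ++c)
+    {
+        // A 1x1 kernel reduces to a dot product along the channel dimension
+        acc += weights[c] * src[(c * height + y) * width + x];
+    }
+    return acc;
+}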
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
+ ({ \
+ acc.s0 = mad(src.s0, weight_value, acc.s0); \
+ acc.s1 = mad(src.s1, weight_value, acc.s1); \
+ acc.s2 = mad(src.s2, weight_value, acc.s2); \
+ acc.s3 = mad(src.s3, weight_value, acc.s3); \
+ })
+
+/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution1x1_f32_bifrost(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ // Get the kernel index
+ const int kernel_index = get_global_id(2);
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ float4 acc0 = 0.0f;
+ float4 acc1 = 0.0f;
+ float4 acc2 = 0.0f;
+ float4 acc3 = 0.0f;
+
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+ {
+ // Load the weights
+ float weight = *((__global float *)weights_addr);
+
+ // Load values from row0 of input tensor
+ float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+ float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+ float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+ float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+ CONVOLUTION1x1_BIFROST(acc0, src0, weight);
+ CONVOLUTION1x1_BIFROST(acc1, src1, weight);
+ CONVOLUTION1x1_BIFROST(acc2, src2, weight);
+ CONVOLUTION1x1_BIFROST(acc3, src3, weight);
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+ acc0.s0 += bias;
+ acc0.s1 += bias;
+ acc0.s2 += bias;
+ acc0.s3 += bias;
+ acc1.s0 += bias;
+ acc1.s1 += bias;
+ acc1.s2 += bias;
+ acc1.s3 += bias;
+ acc2.s0 += bias;
+ acc2.s1 += bias;
+ acc2.s2 += bias;
+ acc2.s3 += bias;
+ acc3.s0 += bias;
+ acc3.s1 += bias;
+ acc3.s2 += bias;
+ acc3.s3 += bias;
+#endif /* defined(HAS_BIAS) */
+
+ vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+ vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+ vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl
new file mode 100644
index 0000000..d094eca
--- /dev/null
+++ b/src/core/CL/cl_kernels/direct_convolution3x3.cl
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+
+#define ADD_OP(a, b) ADD_SAT_OP_EXPAND((a), (b), DATA_TYPE_PROMOTED, 8)
+#define MUL_OP(a, b) MUL_SAT_OP_EXPAND(CONVERT((a), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), CONVERT((b), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)), DATA_TYPE_PROMOTED, 8, FIXED_POINT_POSITION)
+
+// There is no need to have a larger intermediate type for qs32 because all the arguments are already promoted
+MULQ_SAT_IMPL(qs32x8, qs32x8)
+
+#else /* FIXED_POINT_POSITION */
+
+#undef CONVERT_SAT
+
+#define ADD_OP(a, b) ((a) + (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define CONVERT_SAT(a, b) ((a))
+
+#endif /* FIXED_POINT_POSITION */
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+
+#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 3) \
+ weights_values0 = vload3(0, weights_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ src0 = vload8(0, src_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ src1 = vload2(0, src_row_ptr + 8); \
+ \
+ acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
+ })
+
+#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 3) \
+ weights_values0 = vload3(0, weights_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 16) \
+ src0 = vload16(0, src_row_ptr); \
+ DATA_TYPE src1 = *(src_row_ptr + 16); \
+ \
+ acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \
+ acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \
+ })
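+
+/* Illustrative note on the two macros above: for stride 1, element i of the
+ * accumulator receives src[i] * w0 + src[i + 1] * w1 + src[i + 2] * w2 for
+ * i = 0..7, which is why 10 consecutive input values are loaded (vload8 plus
+ * vload2). For stride 2 the pattern becomes src[2i] * w0 + src[2i + 1] * w1 +
+ * src[2i + 2] * w2, requiring 17 input values (vload16 plus one scalar).
+ */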
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note This OpenCL kernel works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution3x3(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+ pixels0 = 0;
+
+ __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ const int kernel_index = get_global_id(2);
+ weights_addr += kernel_index * weights_stride_w;
+
+ for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
+ CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
+ CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+ CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ pixels0 = ADD_OP(pixels0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));
+#endif /* defined(HAS_BIAS) */
+
+ vstore8(CONVERT_SAT(pixels0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \
+ ({ \
+ acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \
+ acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \
+ acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \
+ acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \
+ acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \
+ acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \
+ acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \
+ acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \
+ acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \
+ acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \
+ acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \
+ acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \
+ })
+
+/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution3x3_f32_bifrost(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ // Get the kernel index
+ const int kernel_index = get_global_id(2);
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ float4 pixels0 = 0;
+ float4 pixels1 = 0;
+ float4 pixels2 = 0;
+
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor
+
+ for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+ {
+ // Load the weights
+ float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+ float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+ float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+ float4 src0;
+ float2 src1;
+
+ // Load values from row0 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4);
+
+ CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row0);
+
+ // Load values from row1 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row1);
+ CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row0);
+
+ // Load values from row2 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x3_BIFROST(pixels0, src0, src1, weights_row2);
+ CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row1);
+ CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row0);
+
+ // Load values from row3 of input tensor
+ src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x3_BIFROST(pixels1, src0, src1, weights_row2);
+ CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row1);
+
+ // Row4
+ src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
+ src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x3_BIFROST(pixels2, src0, src1, weights_row2);
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+ pixels0 += (float4)bias;
+ pixels1 += (float4)bias;
+ pixels2 += (float4)bias;
+#endif /* defined(HAS_BIAS) */
+
+ vstore4(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+ vstore4(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
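+
+/* Illustrative note on the Bifrost 3x3 kernel above: each work-item produces a
+ * 4x3 output tile (pixels0..pixels2). Output row r accumulates input rows r,
+ * r + 1 and r + 2 against weight rows 0, 1 and 2 respectively, which is why the
+ * five loaded input rows (0..4) can be shared between the three accumulators
+ * instead of being reloaded per output row.
+ */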
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
new file mode 100644
index 0000000..496da97
--- /dev/null
+++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#undef CONVERT_SAT
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X == 2 */
+
+#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ weights_values0 = vload4(0, weights_row_ptr); \
+ DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ src0 = vload8(0, src_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ src1 = vload4(0, src_row_ptr + 8); \
+ \
+ acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
+ })
+
+#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ weights_values0 = vload4(0, weights_row_ptr); \
+ DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \
+ VEC_DATA_TYPE(DATA_TYPE, 16) \
+ src0 = vload16(0, src_row_ptr); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ src1 = vload4(0, src_row_ptr + 16); \
+ acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
+ \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
+ acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
+ })
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution5x5(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels0 = 0;
+
+ __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ const int kernel_index = get_global_id(2);
+ weights_addr += kernel_index * weights_stride_w;
+
+ for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+ {
+ CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
+ CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+ CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+ CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
+ CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ pixels0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)));
+#endif /* defined(HAS_BIAS) */
+
+ vstore8(pixels0, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \
+ ({ \
+ acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \
+ acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \
+ acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \
+ acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \
+ acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \
+ acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \
+ acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \
+ acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \
+ acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \
+ acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \
+ acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \
+ acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \
+ acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \
+ acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \
+ acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \
+ acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \
+ acc.s0 = mad(src0.s4, weights_row01, acc.s0); \
+ acc.s1 = mad(src0.s5, weights_row01, acc.s1); \
+ acc.s2 = mad(src0.s6, weights_row01, acc.s2); \
+ acc.s3 = mad(src0.s7, weights_row01, acc.s3); \
+ })
+
+/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution5x5_f32_bifrost(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+ VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+ unsigned int weights_stride_w)
+{
+ // Get the kernel index
+ const int kernel_index = get_global_id(2);
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ float4 pixels0 = 0.0f;
+ float4 pixels1 = 0.0f;
+
+ __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
+ __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
+
+ // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
+
+ for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
+ {
+ // Load the weights from row0 and row1
+ float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+ float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4);
+ float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+ float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4);
+ float8 src0;
+
+ // Load values from row0 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
+
+ // Accumulate
+ CONVOLUTION1x5_BIFROST(pixels0, src0, weights_row00, weights_row01);
+
+ // Load values from row1 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
+
+ // Accumulate
+ CONVOLUTION1x5_BIFROST(pixels0, src0, weights_row10, weights_row11);
+ CONVOLUTION1x5_BIFROST(pixels1, src0, weights_row00, weights_row01);
+
+ // Load values from row2 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
+
+ // Load weights from row2
+ weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+ weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x5_BIFROST(pixels0, src0, weights_row00, weights_row01);
+ CONVOLUTION1x5_BIFROST(pixels1, src0, weights_row10, weights_row11);
+
+ // Load values from row3 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+ // Load weights from row3
+ weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y));
+ weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4);
+
+ // Accumulate
+ CONVOLUTION1x5_BIFROST(pixels0, src0, weights_row10, weights_row11);
+ CONVOLUTION1x5_BIFROST(pixels1, src0, weights_row00, weights_row01);
+
+ // Load values from row4 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
+
+ // Load weights from row4
+ weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y));
+ weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4);
+
+ CONVOLUTION1x5_BIFROST(pixels0, src0, weights_row00, weights_row01);
+ CONVOLUTION1x5_BIFROST(pixels1, src0, weights_row10, weights_row11);
+
+ // Load values from row5 of input tensor
+ src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
+
+ // Accumulate
+ CONVOLUTION1x5_BIFROST(pixels1, src0, weights_row00, weights_row01);
+
+ src_addr += src_stride_z;
+ weights_addr += weights_stride_z;
+ }
+
+#ifdef HAS_BIAS
+ Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+ float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index)));
+
+ pixels0 += bias;
+ pixels1 += bias;
+#endif /* defined(HAS_BIAS) */
+
+ vstore4(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+ vstore4(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+}
+#endif // defined(WEIGHTS_DEPTH)
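+
+/* Illustrative example only: a plausible set of compile-time options for the
+ * generic direct convolution kernels in these files (all values hypothetical):
+ *
+ *   -DDATA_TYPE=float -DDATA_TYPE_PROMOTED=float -DDATA_SIZE=32
+ *   -DSTRIDE_X=1 -DWEIGHTS_DEPTH=64 -DHAS_BIAS
+ *
+ * DATA_SIZE is consumed only by the 1x1 kernel's INPUT_PIXEL macro, and the
+ * *_f32_bifrost variants additionally assume stride_x = stride_y = 1 and F32
+ * data.
+ */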
diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl
index 470d14a..3e1929c 100644
--- a/src/core/CL/cl_kernels/fast_corners.cl
+++ b/src/core/CL/cl_kernels/fast_corners.cl
@@ -206,12 +206,11 @@
return;
}
-#ifndef USE_MAXSUPPRESSION
- *out.ptr = 1;
-#else
-
+#ifdef USE_MAXSUPPRESSION
*out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold);
-#endif
+#else /* USE_MAXSUPPRESSION */
+ *out.ptr = 1;
+#endif /* USE_MAXSUPPRESSION */
}
/** Copy result to Keypoint buffer and count number of corners
@@ -240,7 +239,7 @@
{
return;
}
-#endif
+#endif /* UPDATE_NUMBER */
Image in = CONVERT_TO_IMAGE_STRUCT(input);
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
index df63586..fbd4f6a 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -23,6 +23,10 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif /* FIXED_POINT_POSITION */
+
/** Fill N pixel of the padding edge of a single channel image by replicating the closest valid pixel.
*
* @attention The DATA_TYPE needs to be passed at the compile time.
@@ -36,18 +40,20 @@
* @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] width Width of the valid region of the image
* @param[in] height Height of the valid region of the image
* @param[in] start_pos XY coordinate indicating the start point of the valid region
*/
__kernel void fill_image_borders_replicate(
- IMAGE_DECLARATION(buf),
+ TENSOR3D_DECLARATION(buf),
uint width,
uint height,
int2 start_pos)
{
- Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+ Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
// Update pointer to point to the starting point of the valid region
buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
@@ -109,6 +115,8 @@
* @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] width Width of the valid region of the image
* @param[in] height Height of the valid region of the image
@@ -116,13 +124,13 @@
* @param[in] constant_value Constant value to use to fill the edges
*/
__kernel void fill_image_borders_constant(
- IMAGE_DECLARATION(buf),
+ TENSOR3D_DECLARATION(buf),
uint width,
uint height,
int2 start_pos,
DATA_TYPE constant_value)
{
- Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+ Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
// Update pointer to point to the starting point of the valid region
buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
new file mode 100644
index 0000000..5476a6e
--- /dev/null
+++ b/src/core/CL/cl_kernels/fixed_point.h
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_FIXED_POINT_H
+#define ARM_COMPUTE_FIXED_POINT_H
+
+#define TYPE_ALIAS(type, alias) \
+ typedef type alias; \
+ typedef type alias##x##1; \
+ typedef type##2 alias##x##2; \
+ typedef type##3 alias##x##3; \
+ typedef type##4 alias##x##4; \
+ typedef type##8 alias##x##8; \
+ typedef type##16 alias##x##16;
+
+TYPE_ALIAS(char, qs8)
+TYPE_ALIAS(short, qs16)
+TYPE_ALIAS(int, qs32)
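+
+/* For example, TYPE_ALIAS(char, qs8) above makes qs8 and qs8x1 aliases of char,
+ * qs8x2 of char2, qs8x3 of char3, qs8x4 of char4, qs8x8 of char8 and qs8x16 of
+ * char16.
+ */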
+
+#define qs8_MIN ((char)CHAR_MIN)
+#define qs8_MAX ((char)CHAR_MAX)
+#define qs16_MIN ((short)SHRT_MIN)
+#define qs16_MAX ((short)SHRT_MAX)
+#define qs32_MIN ((int)INT_MIN)
+#define qs32_MAX ((int)INT_MAX)
+
+#define qu8_MIN ((uchar)0)
+#define qu8_MAX ((uchar)UCHAR_MAX)
+#define qu16_MIN ((ushort)0)
+#define qu16_MAX ((ushort)USHRT_MAX)
+#define qu32_MIN ((uint)0)
+#define qu32_MAX ((uint)UINT_MAX)
+
+#define qs8_TYPE char
+#define qs8x1_TYPE char
+#define qs8x2_TYPE char2
+#define qs8x3_TYPE char3
+#define qs8x4_TYPE char4
+#define qs8x8_TYPE char8
+#define qs8x16_TYPE char16
+
+#define qs16_TYPE short
+#define qs16x1_TYPE short
+#define qs16x2_TYPE short2
+#define qs16x3_TYPE short3
+#define qs16x4_TYPE short4
+#define qs16x8_TYPE short8
+#define qs16x16_TYPE short16
+
+#define qs32_TYPE int
+#define qs32x1_TYPE int
+#define qs32x2_TYPE int2
+#define qs32x3_TYPE int3
+#define qs32x4_TYPE int4
+#define qs32x8_TYPE int8
+#define qs32x16_TYPE int16
+
+/* All internal constants are represented in the maximum supported fixed point format (QS16),
+ * thus we define an additional shift parameter required to convert the constant
+ * from the maximum supported format to the required one.
+ */
+#define qs8_SHIFT 8
+#define qs16_SHIFT 0
+
+#undef VEC_DATA_TYPE_STR
+#undef VEC_DATA_TYPE
+#undef CONVERT_STR
+#undef CONVERT
+#undef CONVERT_SAT_STR
+#undef CONVERT_SAT
+
+#define VEC_DATA_TYPE_STR(type, size) type##x##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x)))
+#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype)
+#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE)
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x)))
+#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype)
+#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE)
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
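
With these helpers, VEC_DATA_TYPE(qs8, 16) expands to qs8x16 and CONVERT_SAT(x, qs8x16) expands to the OpenCL built-in convert_char16_sat(x), i.e. a conversion that clamps instead of wrapping. A minimal scalar C sketch of what that saturating conversion does for one QS8 lane (illustrative names, not part of the header):

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of convert_char_sat(): clamp a wider value into the QS8 range. */
    static int8_t convert_qs8_sat(int32_t v)
    {
        if(v > INT8_MAX) return INT8_MAX; /* saturate high */
        if(v < INT8_MIN) return INT8_MIN; /* saturate low */
        return (int8_t)v;
    }

    int main(void)
    {
        printf("%d %d\n", convert_qs8_sat(200), convert_qs8_sat(-300)); /* prints 127 -128 */
        return 0;
    }
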
+
+/** Computes the saturating absolute value of a fixed point vector.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point absolute value.
+ */
+#define ABSQ_SAT_IMPL(type) \
+ inline type abs_##type##_sat(type VopA) \
+ { \
+ return CONVERT_SAT(abs(VopA), type); \
+ }
+
+ABSQ_SAT_IMPL(qs8x16)
+ABSQ_SAT_IMPL(qs16x8)
+
+#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a))
+#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size)
+
+/** Computes max of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point maximum.
+ */
+#define MAXQ_IMPL(type) \
+ inline type max_##type(type VopA, type VopB) \
+ { \
+ return max(VopA, VopB); \
+ }
+
+MAXQ_IMPL(qs8x1)
+MAXQ_IMPL(qs8x2)
+MAXQ_IMPL(qs8x4)
+MAXQ_IMPL(qs8x8)
+MAXQ_IMPL(qs8x16)
+MAXQ_IMPL(qs16x1)
+MAXQ_IMPL(qs16x2)
+MAXQ_IMPL(qs16x4)
+MAXQ_IMPL(qs16x8)
+MAXQ_IMPL(qs16x16)
+
+#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b))
+#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size)
+
+/** Computes saturated addition of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point addition. The result is saturated in case of overflow
+ */
+#define ADDQ_SAT_IMPL(type) \
+ inline type add_sat_##type(type VopA, type VopB) \
+ { \
+ return add_sat(VopA, VopB); \
+ }
+
+ADDQ_SAT_IMPL(qs8x1)
+ADDQ_SAT_IMPL(qs8x2)
+ADDQ_SAT_IMPL(qs8x4)
+ADDQ_SAT_IMPL(qs8x8)
+ADDQ_SAT_IMPL(qs8x16)
+ADDQ_SAT_IMPL(qs16x1)
+ADDQ_SAT_IMPL(qs16x2)
+ADDQ_SAT_IMPL(qs16x4)
+ADDQ_SAT_IMPL(qs16x8)
+ADDQ_SAT_IMPL(qs16x16)
+ADDQ_SAT_IMPL(qs32x1)
+ADDQ_SAT_IMPL(qs32x2)
+ADDQ_SAT_IMPL(qs32x4)
+ADDQ_SAT_IMPL(qs32x8)
+ADDQ_SAT_IMPL(qs32x16)
+
+#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b))
+#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size)
+
+/** Computes saturated subtraction of fixed point types.
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point subtraction. The result is saturated in case of overflow
+ */
+#define SUBQ_SAT_IMPL(type) \
+ inline type sub_sat_##type(type VopA, type VopB) \
+ { \
+ return sub_sat(VopA, VopB); \
+ }
+
+SUBQ_SAT_IMPL(qs8x1)
+SUBQ_SAT_IMPL(qs8x2)
+SUBQ_SAT_IMPL(qs8x4)
+SUBQ_SAT_IMPL(qs8x8)
+SUBQ_SAT_IMPL(qs8x16)
+SUBQ_SAT_IMPL(qs16x1)
+SUBQ_SAT_IMPL(qs16x2)
+SUBQ_SAT_IMPL(qs16x4)
+SUBQ_SAT_IMPL(qs16x8)
+SUBQ_SAT_IMPL(qs16x16)
+
+#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b))
+#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size)
+
+/** Multiplication of two fixed point numbers
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiplication.
+ */
+#define MULQ_IMPL(type, itype) \
+ inline type mul_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype round_val = (itype)(1 << (fixed_point_position - 1)); \
+ itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \
+ return CONVERT((res >> (itype)fixed_point_position), type); \
+ }
+
+MULQ_IMPL(qs8x8, qs16x8)
+MULQ_IMPL(qs16x8, qs32x8)
+MULQ_IMPL(qs8x16, qs16x16)
+MULQ_IMPL(qs16x16, qs32x16)
+
+#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position))
+#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position)
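
A worked scalar example of the scheme MULQ_IMPL implements (widen both operands, multiply, add the rounding term 1 << (p - 1), shift right by p), written in plain C for QS8 with fixed_point_position = 4; this is an illustrative sketch rather than library code:

    #include <stdint.h>
    #include <stdio.h>

    /* QS8 multiply with rounding: mirrors mul_qs8x8 for a single lane. */
    static int8_t mul_qs8(int8_t a, int8_t b, int p)
    {
        int16_t round_val = (int16_t)(1 << (p - 1));
        int16_t res       = (int16_t)((int16_t)a * (int16_t)b + round_val);
        return (int8_t)(res >> p);
    }

    int main(void)
    {
        /* With p = 4: 1.5 -> 24, 2.25 -> 36; expected product 3.375 -> 54. */
        printf("%d\n", mul_qs8(24, 36, 4)); /* prints 54 */
        return 0;
    }
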
+
+/** Saturate multiply of two fixed point numbers
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiplication. The result is saturated in case of overflow
+ */
+#define MULQ_SAT_IMPL(type, itype) \
+ inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype round_val = (itype)(1 << (fixed_point_position - 1)); \
+ itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \
+ return CONVERT_SAT((res >> (itype)fixed_point_position), type); \
+ }
+
+MULQ_SAT_IMPL(qs8x1, qs16x1)
+MULQ_SAT_IMPL(qs8x2, qs16x2)
+MULQ_SAT_IMPL(qs8x3, qs16x3)
+MULQ_SAT_IMPL(qs8x4, qs16x4)
+MULQ_SAT_IMPL(qs8x8, qs16x8)
+MULQ_SAT_IMPL(qs8x16, qs16x16)
+MULQ_SAT_IMPL(qs16x1, qs32x1)
+MULQ_SAT_IMPL(qs16x2, qs32x2)
+MULQ_SAT_IMPL(qs16x3, qs32x3)
+MULQ_SAT_IMPL(qs16x4, qs32x4)
+MULQ_SAT_IMPL(qs16x8, qs32x8)
+MULQ_SAT_IMPL(qs16x16, qs32x16)
+
+#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) mul_sat_##type##x##size((a), (b), (position))
+#define MUL_SAT_OP_EXPAND(a, b, type, size, position) MUL_SAT_OP_EXPAND_STR(a, b, type, size, position)
+
+/** Saturate multiply-accumulate
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiply-accumulate. The result is saturated in case of overflow
+ */
+#define MLAQ_SAT_IMPL(type, itype) \
+ type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \
+ { \
+ itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), (itype)(1 << (fixed_point_position - 1))); \
+ return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \
+ }
+
+MLAQ_SAT_IMPL(qs8x8, qs16x8)
+MLAQ_SAT_IMPL(qs8x16, qs16x16)
+MLAQ_SAT_IMPL(qs16x8, qs32x8)
+
+#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) mla_sat_##type##x##size((a), (b), (c), (position))
+#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
+
+/** Saturate multiply-accumulate long
+ *
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiply-accumulate long. The result is saturated in case of overflow
+ */
+#define MLALQ_SAT_IMPL(type, itype) \
+ itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \
+ { \
+ itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), (itype)(1 << (fixed_point_position - 1))); \
+ return add_sat(VopA, res >> (itype)fixed_point_position); \
+ }
+
+MLALQ_SAT_IMPL(qs8x8, qs16x8)
+MLALQ_SAT_IMPL(qs16x8, qs32x8)
+
+#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) mlal_sat_##type##x##size((a), (b), (c), (position))
+#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
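
Unlike MLA, the MLAL accumulator stays in the wider intermediate type, so repeated accumulation (as in the QS8 GEMM kernel further down) is only narrowed and saturated once at the end. A one-lane C sketch of the pattern, assuming QS8 operands and a QS16 accumulator (saturation omitted for brevity; the kernel uses mad_sat/add_sat):

    #include <stdint.h>
    #include <stdio.h>

    /* One lane of the mlal_sat_qs8x8 pattern: acc (QS16) += (a * b) rescaled, with rounding. */
    static int16_t mlal_qs8(int16_t acc, int8_t a, int8_t b, int p)
    {
        int16_t res = (int16_t)((int16_t)a * (int16_t)b + (int16_t)(1 << (p - 1)));
        return (int16_t)(acc + (res >> p));
    }

    int main(void)
    {
        /* p = 4: accumulate 1.5 * 2.0 twice -> 6.0, i.e. 96 in QS16 with p = 4. */
        int16_t acc = 0;
        acc = mlal_qs8(acc, 24, 32, 4);
        acc = mlal_qs8(acc, 24, 32, 4);
        printf("%d\n", acc); /* prints 96 */
        return 0;
    }
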
+
+/** Saturate division of two fixed point vectors
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point division. The result is saturated in case of overflow
+ */
+#define DIVQ_SAT_IMPL(stype, type, itype) \
+ inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \
+ { \
+ itype conv_a = CONVERT((VopA), itype); \
+ itype denominator = CONVERT((VopB), itype); \
+ itype numerator = conv_a << (itype)(fixed_point_position); \
+ itype res = select((itype)(numerator / denominator), select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), (itype)(denominator == (itype)0)); \
+ return CONVERT_SAT((res), type); \
+ }
+
+DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16)
+DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8)
+DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16)
+DIVQ_SAT_IMPL(qs8, qs8, qs16)
+DIVQ_SAT_IMPL(qs16, qs16, qs32)
+
+#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position))
+#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position)
+
+#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) div_sat_##type##x##size((a), (b), (position))
+#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position)
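
The division follows the usual fixed-point recipe: widen the numerator, pre-shift it left by fixed_point_position so the quotient stays in Q format, then divide; a zero denominator saturates to MAX or MIN depending on the numerator's sign. A one-lane C sketch under those assumptions:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of div_sat_qs8: (a << p) / b, saturating when b == 0. */
    static int8_t div_sat_qs8_scalar(int8_t a, int8_t b, int p)
    {
        if(b == 0)
        {
            return (a < 0) ? INT8_MIN : INT8_MAX; /* saturate on division by zero */
        }
        int16_t num = (int16_t)((int16_t)a << p);
        int16_t res = (int16_t)(num / (int16_t)b);
        if(res > INT8_MAX) return INT8_MAX;
        if(res < INT8_MIN) return INT8_MIN;
        return (int8_t)res;
    }

    int main(void)
    {
        /* p = 4: 3.0 -> 48, 1.5 -> 24; 3.0 / 1.5 = 2.0 -> 32. */
        printf("%d\n", div_sat_qs8_scalar(48, 24, 4)); /* prints 32 */
        return 0;
    }
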
+
+/** Saturate exponential of a fixed point vector
+ *
+ * @note The implemented approach uses a Taylor polynomial to approximate the exponential function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of calculated elements.
+ *
+ * @return The result of the fixed point exponential. The result is saturated in case of overflow
+ */
+#define EXPQ_IMPL(stype, type, size) \
+ inline type exp_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+ type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \
+ type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \
+ type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \
+ type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \
+ type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \
+ type dec_m = m >> (type)fixed_point_position; \
+ type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, fixed_point_position); \
+ alpha = CONVERT(abs_diff(VopA, alpha), type); \
+ type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \
+ return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), clz(sum) > dec_m); /* Saturate result if needed */ \
+ }
+
+EXPQ_IMPL(qs8, qs8x16, 16)
+EXPQ_IMPL(qs16, qs16x8, 8)
+EXPQ_IMPL(qs16, qs16x16, 16)
+
+#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position))
+#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position)
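
The approximation works by range reduction: the integer part m of x / ln2 is extracted via inv_ln2, e^x is rewritten as 2^m * e^r with r = |x - m * ln2|, e^r is evaluated with the 4-term polynomial whose coefficients are the hex constants above (approximately 0x7FBA/2^15, 0x3FE9/2^15, 0x1693/2^15 and 0x0592/2^15), and the final shift by m applies the 2^m factor, saturating if it would overflow. A floating-point C sketch of the same reduction, for illustration only:

    #include <math.h>
    #include <stdio.h>

    /* Float model of exp_sat_qs*: range-reduce by ln2, evaluate a 4-term polynomial
     * for e^r, then re-apply the 2^m factor. Coefficients are the kernel's hex
     * constants divided by 2^15 (approximate values). */
    static float exp_approx(float x)
    {
        const float ln2 = 0.693147f;
        const float A = 0.99890f, B = 0.49930f, C = 0.17636f, D = 0.04364f;

        int   m = (int)floorf(x / ln2);      /* integer part of x / ln2  */
        float r = fabsf(x - (float)m * ln2); /* residual in [0, ln2)     */
        float p = 1.0f + r * (A + r * (B + r * (C + r * D)));
        return ldexpf(p, m);                 /* p * 2^m                  */
    }

    int main(void)
    {
        printf("%f vs %f\n", exp_approx(1.0f), expf(1.0f)); /* both ~2.718 */
        return 0;
    }
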
+
+/** Saturate logarithm of a fixed point vector
+ *
+ * @note The implemented approach uses a Taylor polynomial to approximate the logarithm function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of calculated elements.
+ *
+ * @return The result of the fixed point logarithm. The result is saturated in case of overflow
+ */
+#define LOGQ_IMPL(stype, type, size) \
+ inline type log_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+ type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 0.6931471 */ \
+ type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \
+ type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \
+ type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \
+ type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \
+ type inter_a = select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), VopA < const_one); \
+ type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \
+ inter_a = inter_a >> shift_val; \
+ inter_a = sub_sat(inter_a, const_one); \
+ type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \
+ sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \
+ sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \
+ sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, size, fixed_point_position); \
+ return select(select(sum, -sum, VopA < const_one), (type)0, VopA < (type)0); /* Saturate result if needed */ \
+ }
+
+LOGQ_IMPL(qs8, qs8x16, 16)
+LOGQ_IMPL(qs16, qs16x8, 8)
+LOGQ_IMPL(qs16, qs16x16, 16)
+
+#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position))
+#define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position)
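
The logarithm mirrors this: inputs below one are inverted first (ln x = -ln(1/x)), the value is normalised into [1, 2) by a right shift of shift_val bits, a polynomial in (m - 1) approximates log2(m), the exponent shift_val is added back, and the sum is scaled by ln2. A floating-point C sketch of that flow, with coefficients taken from the hex constants above (approximately 1.4384, -0.6772, 0.3219, -0.0832):

    #include <math.h>
    #include <stdio.h>

    /* Float model of log_sat_qs*: normalise into [1, 2), approximate log2 of the
     * mantissa, add the exponent, scale by ln2. Negative inputs map to 0 in the
     * kernel; zero is guarded here as well. */
    static float log_approx(float x)
    {
        if(x <= 0.0f)
        {
            return 0.0f;
        }
        int negate = 0;
        if(x < 1.0f)
        {
            x      = 1.0f / x;          /* ln(x) = -ln(1/x) */
            negate = 1;
        }
        int s = 0;
        while(x >= 2.0f)                /* bring x into [1, 2) */
        {
            x *= 0.5f;
            ++s;
        }
        const float A = 1.4384f, B = -0.6772f, C = 0.3219f, D = -0.0832f;
        float t    = x - 1.0f;
        float poly = t * (A + t * (B + t * (C + t * D)));    /* ~ log2(x) */
        float res  = (poly + (float)s) * 0.693147f;          /* ln(x)     */
        return negate ? -res : res;
    }

    int main(void)
    {
        printf("%f vs %f\n", log_approx(1.5f), logf(1.5f)); /* both ~0.405 */
        return 0;
    }
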
+
+/** Saturate inverse square root of a fixed point vector
+ *
+ * @note Implemented approach uses Newton's method to approximate the inverse square root function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of calculated elements.
+ *
+ * @return The result of the fixed point inverse square root. The result is saturated in case of overflow
+ */
+#define INVSQRTQ_IMPL(stype, type, size) \
+ inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_three = (type)(3 << (fixed_point_position)); \
+ type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \
+ type temp = select((type)(VopA >> shift_value), select((type)stype##_MAX, (type)(VopA << (-shift_value)), (type)(clz(VopA) > (-shift_value))), (type)(shift_value < (type)0)); \
+ type x = temp; \
+ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
+ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
+ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
+ if(sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \
+ { \
+ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
+ x = MUL_SAT_OP_EXPAND(x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, fixed_point_position), temp, stype, size, fixed_point_position)), stype, size, fixed_point_position) >> 1; \
+ } \
+ type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \
+ return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), (type)(clz(x) > shift_value2)), (type)(shift_value < (type)0)); /* Saturate result if needed */ \
+ }
+
+INVSQRTQ_IMPL(qs8, qs8x1, 1)
+INVSQRTQ_IMPL(qs16, qs16x1, 1)
+INVSQRTQ_IMPL(qs8, qs8x16, 16)
+INVSQRTQ_IMPL(qs16, qs16x8, 8)
+
+#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position))
+#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position)
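
Each of the repeated lines above is one Newton step for 1/sqrt(a): x <- x * (3 - a * x^2) / 2, applied to an input that has first been brought into a well-conditioned range by a power-of-two shift which is undone on the result. A floating-point C sketch of the iteration itself:

    #include <math.h>
    #include <stdio.h>

    /* Newton iteration for 1/sqrt(a): x <- x * (3 - a * x * x) / 2. */
    static float invsqrt_newton(float a, float x0, int steps)
    {
        float x = x0;
        for(int i = 0; i < steps; ++i)
        {
            x = x * (3.0f - a * x * x) * 0.5f;
        }
        return x;
    }

    int main(void)
    {
        /* Starting from a rough guess, a few steps converge quickly. */
        printf("%f vs %f\n", invsqrt_newton(2.0f, 0.7f, 3), 1.0f / sqrtf(2.0f));
        return 0;
    }
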
+
+/** Saturate hyperbolic tangent of a fixed point vector
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type the actual data type.
+ * @param[in] size the number of calculated elements.
+ *
+ * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of overflow
+ */
+#define TANHQ_IMPL(stype, type, size) \
+ inline type tanh_sat_##type(type VopA, int fixed_point_position) \
+ { \
+ type const_one = (type)(1 << (fixed_point_position)); \
+ type const_two = (type)(2 << (fixed_point_position)); \
+ type exp2x = EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), stype, size, fixed_point_position); \
+ type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
+ type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \
+ return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \
+ }
+
+TANHQ_IMPL(qs8, qs8x16, 16)
+TANHQ_IMPL(qs16, qs16x8, 8)
+
+#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position))
+#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position)
+
+#define floatx16 float16
+#define float16_TYPE float16
+
+#define CONVERTQ_DOWN_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT(a * (1 << fixed_point_position) + select((in_type)-0.5, (in_type)0.5, isgreater(a, (in_type)0)), out_type); \
+ }
+
+CONVERTQ_DOWN_IMPL(float16, qs8x16)
+CONVERTQ_DOWN_IMPL(float16, qs16x16)
+
+#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT_SAT(a * (1 << fixed_point_position) + select((in_type)-0.5, (in_type)0.5, isgreater(a, (in_type)0)), out_type); \
+ }
+
+CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16)
+CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16)
+
+#define CONVERTQ_UP_IMPL(in_type, out_type) \
+ inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \
+ { \
+ return CONVERT(a, out_type) / (1 << fixed_point_position); \
+ }
+
+CONVERTQ_UP_IMPL(qs8x16, float16)
+CONVERTQ_UP_IMPL(qs16x16, float16)
+
+#define SQCVT_SAT_IMPL(type) \
+ inline type sqcvt_##type##_sat(float a, int fixed_point_position) \
+ { \
+ return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \
+ }
+
+SQCVT_SAT_IMPL(qs8)
+SQCVT_SAT_IMPL(qs16)
+
+#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position))
+#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position)
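
These conversions are the standard ones: to fixed point, multiply by 2^fixed_point_position and round to nearest (the +/-0.5 term); back to float, divide by 2^fixed_point_position. A one-lane C sketch, illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    /* float -> QS8 with round-to-nearest and saturation, and QS8 -> float. */
    static int8_t float_to_qs8(float a, int p)
    {
        float scaled = a * (float)(1 << p) + ((a < 0.0f) ? -0.5f : 0.5f);
        if(scaled > 127.0f)  return 127;   /* saturate */
        if(scaled < -128.0f) return -128;
        return (int8_t)scaled;
    }

    static float qs8_to_float(int8_t q, int p)
    {
        return (float)q / (float)(1 << p);
    }

    int main(void)
    {
        int8_t q = float_to_qs8(1.3f, 5);            /* 1.3 * 32 = 41.6 -> 42       */
        printf("%d -> %f\n", q, qs8_to_float(q, 5)); /* 42 -> 1.3125 (quantisation) */
        return 0;
    }
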
+
+#endif // ARM_COMPUTE_FIXED_POINT_H
diff --git a/src/core/CL/cl_kernels/floor.cl b/src/core/CL/cl_kernels/floor.cl
new file mode 100644
index 0000000..e967e6b
--- /dev/null
+++ b/src/core/CL/cl_kernels/floor.cl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform a floor operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void floor_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (floor(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr);
+}
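
The -DDATA_TYPE and -DVEC_SIZE defines above are supplied as OpenCL program build options. A minimal host-side C sketch of building such a program directly with the standard OpenCL API (the helper name and the handles passed in are assumptions for illustration, not library code):

    #include <CL/cl.h>
    #include <stdio.h>

    /* Build a program containing floor_layer with the compile-time options the
     * kernel documentation asks for (hypothetical helper). */
    static int build_floor_program(cl_program program, cl_device_id device)
    {
        const char *options = "-DDATA_TYPE=float -DVEC_SIZE=16";
        cl_int      err     = clBuildProgram(program, 1, &device, options, NULL, NULL);
        if(err != CL_SUCCESS)
        {
            fprintf(stderr, "clBuildProgram failed: %d\n", (int)err);
            return -1;
        }
        return 0;
    }
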
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index caf6e3f..35a2e47 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -23,55 +23,59 @@
*/
#include "helpers.h"
+#ifdef FIXED_POINT_POSITION
+#include "fixed_point.h"
+#endif // FIXED_POINT_POSITION
+
/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x4_f32(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
{
uint x = get_global_id(0);
uint y = get_global_id(1);
- /* Compute address for Matrix B - source */
+ // Compute address for Matrix B - source
Image src = CONVERT_TO_IMAGE_STRUCT(src);
- /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+ // Compute address for Matrix B transposed - destination. X and Y are swapped
uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
- float4 b0 = vload4(0, (__global float *)src.ptr);
+ uint4 b0 = vload4(0, (__global uint *)src.ptr);
- vstore4(b0, 0, (__global float *)(dst_ptr + dst_addr_in_bytes));
+ vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes));
}
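
The switch from float4 to uint4 loads and stores is what makes the renamed kernel type-agnostic: the transposition only rearranges 32-bit lanes, so copying the raw bit pattern works equally for U32, S32 and F32 data. A small C illustration of why a bit-pattern copy is lossless:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        float    f = 3.14159f;
        uint32_t bits;
        float    copy;

        memcpy(&bits, &f, sizeof(bits)); /* move the value as an opaque 32-bit pattern */
        memcpy(&copy, &bits, sizeof(copy));

        printf("%f\n", copy); /* prints 3.141590 - the value survives unchanged */
        return 0;
    }
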
/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x8_f16(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
{
uint x = get_global_id(0);
uint y = get_global_id(1);
@@ -82,28 +86,28 @@
/* Compute address for Matrix B transposed - destination. X and Y are swapped */
uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
- half8 b0 = vload8(0, (__global half *)src.ptr);
+ ushort8 b0 = vload8(0, (__global ushort *)src.ptr);
- vstore8(b0, 0, (__global half *)(dst_ptr + dst_addr_in_bytes));
+ vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes));
}
/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x16_u8(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
{
uint x = get_global_id(0);
uint y = get_global_id(1);
@@ -127,7 +131,7 @@
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U32/S32/F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -142,33 +146,33 @@
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
/* Load values from Matrix A */
- float4 a0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
- float4 a1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
- float4 a2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
- float4 a3 = vload4(0, (__global float *)(offset(&src, 0, 3)));
+ uint4 a0 = vload4(0, (__global uint *)(offset(&src, 0, 0)));
+ uint4 a1 = vload4(0, (__global uint *)(offset(&src, 0, 1)));
+ uint4 a2 = vload4(0, (__global uint *)(offset(&src, 0, 2)));
+ uint4 a3 = vload4(0, (__global uint *)(offset(&src, 0, 3)));
- float4 val0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
- vstore4(val0, 0, ((__global float *)dst.ptr) + 0);
+ uint4 val0 = (uint4)(a0.s0, a1.s0, a2.s0, a3.s0);
+ vstore4(val0, 0, ((__global uint *)dst.ptr) + 0);
- val0 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
- vstore4(val0, 0, ((__global float *)dst.ptr) + 4);
+ val0 = (uint4)(a0.s1, a1.s1, a2.s1, a3.s1);
+ vstore4(val0, 0, ((__global uint *)dst.ptr) + 4);
- val0 = (float4)(a0.s2, a1.s2, a2.s2, a3.s2);
- vstore4(val0, 0, ((__global float *)dst.ptr) + 8);
+ val0 = (uint4)(a0.s2, a1.s2, a2.s2, a3.s2);
+ vstore4(val0, 0, ((__global uint *)dst.ptr) + 8);
- val0 = (float4)(a0.s3, a1.s3, a2.s3, a3.s3);
- vstore4(val0, 0, ((__global float *)dst.ptr) + 12);
+ val0 = (uint4)(a0.s3, a1.s3, a2.s3, a3.s3);
+ vstore4(val0, 0, ((__global uint *)dst.ptr) + 12);
}
/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/F16
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U16/S16/F16
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -183,33 +187,33 @@
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
/* Load values from Matrix A */
- half8 a0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
- half8 a1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
- half8 a2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
- half8 a3 = vload8(0, (__global half *)(offset(&src, 0, 3)));
+ ushort8 a0 = vload8(0, (__global ushort *)(offset(&src, 0, 0)));
+ ushort8 a1 = vload8(0, (__global ushort *)(offset(&src, 0, 1)));
+ ushort8 a2 = vload8(0, (__global ushort *)(offset(&src, 0, 2)));
+ ushort8 a3 = vload8(0, (__global ushort *)(offset(&src, 0, 3)));
- half8 val0 = (half8)((half4)(a0.s0, a1.s0, a2.s0, a3.s0), (half4)(a0.s1, a1.s1, a2.s1, a3.s1));
- vstore8(val0, 0, ((__global half *)dst.ptr) + 0);
+ ushort8 val0 = (ushort8)((ushort4)(a0.s0, a1.s0, a2.s0, a3.s0), (ushort4)(a0.s1, a1.s1, a2.s1, a3.s1));
+ vstore8(val0, 0, ((__global ushort *)dst.ptr) + 0);
- val0 = (half8)((half4)(a0.s2, a1.s2, a2.s2, a3.s2), (half4)(a0.s3, a1.s3, a2.s3, a3.s3));
- vstore8(val0, 0, ((__global half *)dst.ptr) + 8);
+ val0 = (ushort8)((ushort4)(a0.s2, a1.s2, a2.s2, a3.s2), (ushort4)(a0.s3, a1.s3, a2.s3, a3.s3));
+ vstore8(val0, 0, ((__global ushort *)dst.ptr) + 8);
- val0 = (half8)((half4)(a0.s4, a1.s4, a2.s4, a3.s4), (half4)(a0.s5, a1.s5, a2.s5, a3.s5));
- vstore8(val0, 0, ((__global half *)dst.ptr) + 16);
+ val0 = (ushort8)((ushort4)(a0.s4, a1.s4, a2.s4, a3.s4), (ushort4)(a0.s5, a1.s5, a2.s5, a3.s5));
+ vstore8(val0, 0, ((__global ushort *)dst.ptr) + 16);
- val0 = (half8)((half4)(a0.s6, a1.s6, a2.s6, a3.s6), (half4)(a0.s7, a1.s7, a2.s7, a3.s7));
- vstore8(val0, 0, ((__global half *)dst.ptr) + 24);
+ val0 = (ushort8)((ushort4)(a0.s6, a1.s6, a2.s6, a3.s6), (ushort4)(a0.s7, a1.s7, a2.s7, a3.s7));
+ vstore8(val0, 0, ((__global ushort *)dst.ptr) + 24);
}
/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8/S8
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -248,65 +252,47 @@
/** This kernel accumulates each row with the biases vector
*
- * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F32
+ * @note The data type must be passed at compile time using -DDATA_TYPE=type, e.g. -DDATA_TYPE=short
+ *
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/QS8/U16/S16/F16/U32/S32/F32
* @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
* @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)
* @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
- * @param[in] biases_ptr Pointer to the biases vector. Same as input.
+ * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
* @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void gemm_accumulate_biases_f32(
+#ifdef DATA_TYPE
+__kernel void gemm_accumulate_biases(
IMAGE_DECLARATION(accum),
VECTOR_DECLARATION(biases))
{
Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
- float4 accum_value = vload4(0, (__global float *)accum.ptr);
- float4 biases_value = vload4(0, (__global float *)biases.ptr);
- accum_value = biases_value + accum_value;
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ accum_value = vload16(0, (__global DATA_TYPE *)accum.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ biases_value = vload16(0, (__global DATA_TYPE *)biases.ptr);
+#ifdef FIXED_POINT_POSITION
+ accum_value = ADD_SAT_OP_EXPAND(biases_value, accum_value, DATA_TYPE, 16);
+#else // FIXED_POINT_POSITION
+ accum_value = biases_value + accum_value;
+#endif // FIXED_POINT_POSITION
// Store result in the accumulate buffer
- vstore4(accum_value, 0, (__global float *)accum.ptr);
+ vstore16(accum_value, 0, (__global DATA_TYPE *)accum.ptr);
}
+#endif /* DATA_TYPE */
-/** This kernel accumulates each row with the biases vector
- *
- * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16
- * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
- * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
- * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
- * @param[in] biases_ptr Pointer to the biases vector. Same as input.
- * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void gemm_accumulate_biases_f16(
- IMAGE_DECLARATION(accum),
- VECTOR_DECLARATION(biases))
-{
- Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
- Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-
- half8 accum_value = vload8(0, (__global half *)accum.ptr);
- half8 biases_value = vload8(0, (__global half *)biases.ptr);
- accum_value = biases_value + accum_value;
-
- // Store result in the accummulate buffer
- vstore8(accum_value, 0, (__global half *)accum.ptr);
-}
-
-#if(defined WIDTH_MATRIX_B)
+#ifdef COLS_B
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_u8 and @ref gemm_transpose1x16_u8 before running the matrix multiplication
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B
+ * @attention The width of matrix B needs to be passed at compile time using -DCOLS_B
*
* @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -314,13 +300,13 @@
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported formats: U8
+ * @param[in] src1_ptr Pointer to the source matrix. Supported formats: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported formats: U8
+ * @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -332,14 +318,14 @@
* @param[in] c_mult_int Multiplied with each element of the matrix C.
* @param[in] shift Number of bits to shift right the result.
*/
-__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst),
- int a_offset,
- int b_offset,
- int c_offset,
- int c_mult_int,
- int shift)
+__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst),
+ int a_offset,
+ int b_offset,
+ int c_offset,
+ int c_mult_int,
+ int shift)
{
/* src_addr.s0 = address of matrix A */
/* src_addr.s1 = address of matrix B */
@@ -352,7 +338,7 @@
src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
/* Compute end row address for matrix B */
- int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
/* Reset accumulators */
int16 c00 = 0.0f;
@@ -406,13 +392,13 @@
vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
}
-#endif
+#endif /* COLS_B */
-#if(defined WIDTH_MATRIX_B && defined ALPHA)
+#if defined(COLS_B) && defined(ALPHA)
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -420,22 +406,22 @@
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
/* src_addr.s0 = address of matrix A */
/* src_addr.s1 = address of matrix B */
@@ -451,7 +437,7 @@
src_addr = src_addr >> 2;
/* Compute end row address for matrix B */
- int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
/* Reset accumulators */
float4 c00 = 0.0f;
@@ -509,9 +495,9 @@
}
/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -519,22 +505,22 @@
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
@@ -542,7 +528,7 @@
__global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- __global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;
+ __global float *src_end_addr_b = src_addr_b + COLS_B;
// Reset accumulators
float c00 = 0.0f;
@@ -719,9 +705,9 @@
}
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -729,22 +715,22 @@
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
/* src_addr.s0 = address of matrix A */
/* src_addr.s1 = address of matrix B */
@@ -760,7 +746,7 @@
src_addr = src_addr >> 1;
/* Compute end row address for matrix B */
- int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
/* Reset accumulators */
half8 c00 = 0.0f;
@@ -768,7 +754,7 @@
half8 c20 = 0.0f;
half8 c30 = 0.0f;
- for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))
+ for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))
{
/* Load values from matrix A (interleaved) and matrix B (transposed) */
half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
@@ -817,146 +803,634 @@
vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
}
-#if(defined WIDTH_VECTOR_A)
-/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
+#ifdef FIXED_POINT_POSITION
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
*
- * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
+ * @attention The width of matrix B, the alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
*
- * @attention The input vector A and matrix B must not be reshaped
+ * @note ALPHA must be passed in 8 bit fixed point format
*
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
- int idx = get_global_id(0) * 4;
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
- /* Compute the address for the vector A and matrix B */
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
- src_addr.s1 += idx * sizeof(float);
+ /* Compute address for matrix A and B */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
- int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
- float4 acc = 0.0f;
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
- for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ /* Reset accumulators */
+ short8 c00 = 0.0f;
+ short8 c10 = 0.0f;
+ short8 c20 = 0.0f;
+ short8 c30 = 0.0f;
+ short8 c01 = 0.0f;
+ short8 c11 = 0.0f;
+ short8 c21 = 0.0f;
+ short8 c31 = 0.0f;
+
+ /* This for loop performs 1 accumulation for each iteration */
+ for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))
{
- float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
- float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);
+ char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);
- acc += b0 * (float4)a0.s0;
- acc += b1 * (float4)a0.s1;
- }
+ c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
+ c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);
+ c20 = mlal_sat_qs8x8(c20, (char8)a0.s2, b0.s01234567, FIXED_POINT_POSITION);
+ c30 = mlal_sat_qs8x8(c30, (char8)a0.s3, b0.s01234567, FIXED_POINT_POSITION);
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
- {
- float a0 = *((__global float *)(src0_ptr + src_addr.s0));
- float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
-
- acc += b0 * (float4)a0;
+ c01 = mlal_sat_qs8x8(c01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ c11 = mlal_sat_qs8x8(c11, (char8)a0.s1, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ c21 = mlal_sat_qs8x8(c21, (char8)a0.s2, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);
}
/* Compute destination address */
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
- /* Multiply by the weight of vector-matrix product */
- acc = acc * (float4)ALPHA;
+ /* Multiply by the weight of matrix product */
+ char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));
+ char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));
+ char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));
+ char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));
- vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+ c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+
+ /* Store 16x4 block */
+ vstore16(c00_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
+ vstore16(c10_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
+ vstore16(c20_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
+ vstore16(c30_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
}
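Note: for illustration only, a hypothetical build of gemm_mm_interleaved_transposed_qs8 (the values below are examples, not taken from the library):

/* Example build options (hypothetical values): reshaped matrix B is 128
 * elements wide, alpha = 1.0 encoded with 3 fractional bits (1.0 * 2^3 = 8):
 *
 *   -DCOLS_B=128 -DALPHA=8 -DFIXED_POINT_POSITION=3
 *
 * With these settings each work item writes a 16x4 block of the QS8 output
 * (see the vstore16 calls above).
 */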
-/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
- * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
+ * @attention The width of matrix B, the alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
*
- * @attention The input vector A and matrix B must not be reshaped
+ * @note: ALPHA must be passed in 16 bit fixed point format
*
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
- int idx = get_global_id(0) * 8;
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
- /* Compute the address for the vector A and matrix B */
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
- src_addr.s1 += idx * sizeof(half);
+ /* Compute address for matrix A and B */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
- int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
- half8 acc = 0.0f;
+ /* Divide by 2 in order to get the src_addr in units of short */
+ src_addr = src_addr >> 1;
- for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
+
+ /* Reset accumulators */
+ int8 c00 = 0;
+ int8 c10 = 0;
+ int8 c20 = 0;
+ int8 c30 = 0;
+
+ /* This for loop performs 1 accumulation for each iteration */
+ for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))
{
- half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
- half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
- half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
- half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
- half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0);
+ short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1);
- acc += b0 * (half8)a0.s0;
- acc += b1 * (half8)a0.s1;
- acc += b2 * (half8)a0.s2;
- acc += b3 * (half8)a0.s3;
- }
-
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
- {
- half a0 = *((__global half *)(src0_ptr + src_addr.s0));
- half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
-
- acc += b0 * (half8)a0;
+ c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);
+ c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);
+ c20 = mlal_sat_qs16x8(c20, (short8)a0.s2, b0, FIXED_POINT_POSITION);
+ c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);
}
/* Compute destination address */
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
- /* Multiply by the weight of vector-matrix product */
- acc = acc * (half8)ALPHA;
+ /* Multiply by the weight of matrix product */
+ short8 c00_qs16 = convert_short8_sat(c00);
+ short8 c10_qs16 = convert_short8_sat(c10);
+ short8 c20_qs16 = convert_short8_sat(c20);
+ short8 c30_qs16 = convert_short8_sat(c30);
- vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
+ c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+
+ /* Store 8x4 block */
+ vstore8(c00_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
+ vstore8(c10_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
+ vstore8(c20_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
+ vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
}
-#endif /* (defined WIDTH_VECTOR_A) */
-#endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
+#endif // defined(FIXED_POINT_POSITION)
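Note: the saturating fixed point helpers used by these kernels (mlal_sat_qs8x8, mul_sat_qs8x16, mlal_sat_qs16x8, ...) come from the library's fixed point headers; as a rough scalar sketch of the intended arithmetic (ignoring rounding, names illustrative only):

/* Scalar sketch of a saturating QS8 multiply-accumulate into a 16 bit
 * accumulator, assuming FIXED_POINT_POSITION fractional bits. */
short mlal_sat_qs8_scalar(short acc, char a, char b, int fixed_point_position)
{
    int prod = ((int)a * (int)b) >> fixed_point_position; /* rescale the product back to the Q format */
    int sum  = (int)acc + prod;                           /* accumulate in a wider type */
    if(sum > 32767)
    {
        sum = 32767;                                      /* saturate to the 16 bit range */
    }
    if(sum < -32768)
    {
        sum = -32768;
    }
    return (short)sum;
}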
-#if(defined BETA)
+#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+#if defined(DATA_TYPE)
+#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with floating point data types (F16/F32)
+ * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ * @note The width of matrix A and the alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(DATA_TYPE);
+
+ int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
+
+ VECTOR_TYPE acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VECTOR_TYPE acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VECTOR_TYPE acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VECTOR_TYPE acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
+ {
+ // Load values from matrix A
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ // Load values from matrix B
+ VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+ VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ // Accumulate
+ acc0 += b0 * (VECTOR_TYPE)a0.s0;
+ acc0 += b1 * (VECTOR_TYPE)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * (VECTOR_TYPE)a1.s0;
+ acc1 += b1 * (VECTOR_TYPE)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * (VECTOR_TYPE)a2.s0;
+ acc2 += b1 * (VECTOR_TYPE)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * (VECTOR_TYPE)a3.s0;
+ acc3 += b1 * (VECTOR_TYPE)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
+ {
+ // Load values from matrix A
+ DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ // Load values from matrix B
+ VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+
+ // Accumulate
+ acc0 += b0 * (VECTOR_TYPE)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * (VECTOR_TYPE)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * (VECTOR_TYPE)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * (VECTOR_TYPE)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Multiply by the weight of matrix-matrix product and store the result
+ acc0 = acc0 * (VECTOR_TYPE)ALPHA;
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (acc0, 0, (__global DATA_TYPE *)(offset(&dst, 0, 0)));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = acc1 * (VECTOR_TYPE)ALPHA;
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (acc1, 0, (__global DATA_TYPE *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = acc2 * (VECTOR_TYPE)ALPHA;
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (acc2, 0, (__global DATA_TYPE *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = acc3 * (VECTOR_TYPE)ALPHA;
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (acc3, 0, (__global DATA_TYPE *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif // defined(DATA_TYPE)
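Note: a hypothetical host-side configuration for gemm_mm_floating_point, to show how the defines above fit together (the values are examples only; in practice the build options are assembled by the library's kernel classes at run time):

/* Example build options (hypothetical values). With NUM_ELEMS_PROCESSED_PER_THREAD_X=4
 * and NUM_ELEMS_PROCESSED_PER_THREAD_Y=4 each work item produces a 4x4 block of the
 * output, so for an MxK by KxN product the 2D global work size is roughly
 * (ceil(N / 4), ceil(M / 4)). */
const char *gemm_mm_fp_build_opts = "-DDATA_TYPE=float "
                                    "-DCOLS_A=96 "
                                    "-DALPHA=1.0f "
                                    "-DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 "
                                    "-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4";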
+
+#ifdef FIXED_POINT_POSITION
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the QS8 fixed point data type
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ * @note The width of matrix A and the alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ * @note The fixed point position needs to be passed at compile time using -DFIXED_POINT_POSITION
+ * @note The alpha value must be passed in 8 bit fixed point format using -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(char);
+
+ int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char));
+
+ short8 acc00 = 0;
+ short8 acc01 = 0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ short8 acc10 = 0;
+ short8 acc11 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ short8 acc20 = 0;
+ short8 acc21 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ short8 acc30 = 0;
+ short8 acc31 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // This for loop performs 4 accumulations per iteration
+ for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
+ {
+ char2 a0 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ char2 a1 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ char2 a2 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ char2 a3 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
+ char16 b1 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+
+ acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
+ acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s1, b1.s01234567, FIXED_POINT_POSITION);
+ acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s0, b0.s01234567, FIXED_POINT_POSITION);
+ acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s1, b1.s01234567, FIXED_POINT_POSITION);
+ acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s0, b0.s01234567, FIXED_POINT_POSITION);
+ acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s1, b1.s01234567, FIXED_POINT_POSITION);
+ acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s0, b0.s01234567, FIXED_POINT_POSITION);
+ acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s1, b1.s01234567, FIXED_POINT_POSITION);
+ acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+ acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Left-over accumulations
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
+ {
+ char a0 = *((__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ char a1 = *((__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ char a2 = *((__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ char a3 = *((__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1));
+
+ acc00 = mlal_sat_qs8x8(acc00, (char8)a0, b0.s01234567, FIXED_POINT_POSITION);
+ acc01 = mlal_sat_qs8x8(acc01, (char8)a0, b0.s89ABCDEF, FIXED_POINT_POSITION);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc10 = mlal_sat_qs8x8(acc10, (char8)a1, b0.s01234567, FIXED_POINT_POSITION);
+ acc11 = mlal_sat_qs8x8(acc11, (char8)a1, b0.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc20 = mlal_sat_qs8x8(acc20, (char8)a2, b0.s01234567, FIXED_POINT_POSITION);
+ acc21 = mlal_sat_qs8x8(acc21, (char8)a2, b0.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc30 = mlal_sat_qs8x8(acc30, (char8)a3, b0.s01234567, FIXED_POINT_POSITION);
+ acc31 = mlal_sat_qs8x8(acc31, (char8)a3, b0.s89ABCDEF, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Multiply by the weight of matrix product and store the result
+ char16 acc_qs8;
+ acc_qs8 = convert_char16_sat((short16)(acc00, acc01));
+ acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc_qs8 = convert_char16_sat((short16)(acc10, acc11));
+ acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc_qs8 = convert_char16_sat((short16)(acc20, acc21));
+ acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc_qs8 = convert_char16_sat((short16)(acc30, acc31));
+ acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+ vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+
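Note: for the fixed point kernels ALPHA (and BETA further below) must already be expressed in the Q format, i.e. the real value scaled by 2^FIXED_POINT_POSITION; for example, with FIXED_POINT_POSITION=3 the real value 1.5 becomes the QS8 integer 1.5 * 2^3 = 12, so it would be passed as -DALPHA=12. A small conversion sketch (illustrative only, assuming round half away from zero):

/* Convert a real-valued scalar to its QS8 representation (illustrative). */
char float_to_qs8(float x, int fixed_point_position)
{
    float scaled = x * (float)(1 << fixed_point_position);
    if(scaled > 127.0f)
    {
        scaled = 127.0f;
    }
    if(scaled < -128.0f)
    {
        scaled = -128.0f;
    }
    return (char)(scaled + ((scaled >= 0.0f) ? 0.5f : -0.5f));
}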
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the QS16 fixed point data type
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ * @note The width of matrix A and the alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ * @note The fixed point position needs to be passed at compile time using -DFIXED_POINT_POSITION
+ * @note The alpha value must be passed in 16 bit fixed point format using -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(short);
+
+ int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));
+
+ int8 acc0 = 0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ int8 acc1 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ int8 acc2 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ int8 acc3 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // This for loop performs 4 accumulations per iteration
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))
+ {
+ short2 a0 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ short2 a1 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ short2 a2 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ short2 a3 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
+ short8 b1 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+
+ acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s0, b0, FIXED_POINT_POSITION);
+ acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s1, b1, FIXED_POINT_POSITION);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s0, b0, FIXED_POINT_POSITION);
+ acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s1, b1, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s0, b0, FIXED_POINT_POSITION);
+ acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s1, b1, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s0, b0, FIXED_POINT_POSITION);
+ acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s1, b1, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Left-over accumulations
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(short), src1_stride_y))
+ {
+ short a0 = *((__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ short a1 = *((__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ short a2 = *((__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ short a3 = *((__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1));
+
+ acc0 = mlal_sat_qs16x8(acc0, (short8)a0, b0, FIXED_POINT_POSITION);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = mlal_sat_qs16x8(acc1, (short8)a1, b0, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = mlal_sat_qs16x8(acc2, (short8)a2, b0, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = mlal_sat_qs16x8(acc3, (short8)a3, b0, FIXED_POINT_POSITION);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Multiply by the weight of matrix product and store the result
+ short8 acc_qs16;
+ acc_qs16 = convert_short8_sat(acc0);
+ acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc_qs16 = convert_short8_sat(acc1);
+ acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc_qs16 = convert_short8_sat(acc2);
+ acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc_qs16 = convert_short8_sat(acc3);
+ acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+ vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif // defined(FIXED_POINT_POSITION)
+#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+#endif // defined(COLS_B) && defined(ALPHA)
+
+#ifdef BETA
/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
*
* @attention The beta's value needs to be passed at compile time using -DBETA
@@ -967,7 +1441,7 @@
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -996,13 +1470,15 @@
/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
*
+ * @attention The beta's value needs to be passed at compile time using -DBETA
+ *
* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -1028,9 +1504,89 @@
/* Store final result in axb matrix */
vstore8(out, 0, (__global half *)dst.ptr);
}
-#endif /* (defined BETA) */
-#if(defined WIDTH_VECTOR_A)
+#ifdef FIXED_POINT_POSITION
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 8 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
+ *
+ * @note: BETA must be passed in 8 bit fixed point format
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: QS8
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_qs8(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from A x B */
+ char16 alpha_ab = vload16(0, (__global char *)dst.ptr);
+
+ /* Load values from Matrix C */
+ char16 c = vload16(0, (__global char *)src.ptr);
+
+ /* Computes alpha * axb + beta * c */
+ char16 out = mla_sat_qs8x16(alpha_ab, (char16)BETA, c, FIXED_POINT_POSITION);
+
+ /* Store final result in axb matrix */
+ vstore16(out, 0, (__global char *)dst.ptr);
+}
+
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 16 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
+ *
+ * @note: BETA must be passed in 16 bit fixed point format
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: QS16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_qs16(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from A x B */
+ short8 alpha_ab = vload8(0, (__global short *)dst.ptr);
+
+ /* Load values from Matrix C */
+ short8 c = vload8(0, (__global short *)src.ptr);
+
+ /* Computes alpha * axb + beta * c */
+ short8 out = mla_sat_qs16x8(alpha_ab, (short8)BETA, c, FIXED_POINT_POSITION);
+
+ /* Store final result in axb matrix */
+ vstore8(out, 0, (__global short *)dst.ptr);
+}
+#endif /* defined(FIXED_POINT_POSITION) */
+#endif /* defined(BETA) */
+
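Note: taken together, the gemm_mm_* and gemm_ma_* kernels split D = alpha * A * B + beta * C into two passes over the destination; a scalar reference of that split (illustrative only):

/* Reference semantics of the two-pass GEMM: gemm_mm_* writes alpha * A * B
 * into dst, then gemm_ma_* adds the beta-weighted C term in place. */
void gemm_reference(const float *a, const float *b, const float *c, float *dst,
                    int m, int n, int k, float alpha, float beta)
{
    for(int i = 0; i < m; ++i)
    {
        for(int j = 0; j < n; ++j)
        {
            float acc = 0.0f;
            for(int l = 0; l < k; ++l)
            {
                acc += a[i * k + l] * b[l * n + j];
            }
            dst[i * n + j] = alpha * acc;              /* gemm_mm_* pass */
            dst[i * n + j] += beta * c[i * n + j];     /* gemm_ma_* pass (in place) */
        }
    }
}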
+#ifdef WIDTH_VECTOR_A
/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
*
* @attention The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
@@ -1043,7 +1599,7 @@
* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
@@ -1051,7 +1607,7 @@
* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
@@ -1096,4 +1652,4 @@
vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
}
-#endif /* (defined WIDTH_VECTOR_A) */
+#endif /* WIDTH_VECTOR_A */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl
new file mode 100644
index 0000000..76128f7
--- /dev/null
+++ b/src/core/CL/cl_kernels/gemv.cl
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel applies the dot product to each plane of the input tensor and the corresponding column of the reshaped weight tensor.
+ *
+ * @note The data type and the source width and height must be given as preprocessor arguments using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height, e.g. -DDATA_TYPE=short
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VECTOR_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ int y = get_global_id(1) * 4;
+ int z = get_global_id(2);
+
+ __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+ __global uchar *input_ptr = src.ptr;
+
+ DATA_TYPE acc0 = (DATA_TYPE)0;
+ DATA_TYPE acc1 = (DATA_TYPE)0;
+ DATA_TYPE acc2 = (DATA_TYPE)0;
+ DATA_TYPE acc3 = (DATA_TYPE)0;
+
+ // This kernel handles 4 rows per thread so that it can reuse the weights
+ for(int i = 0; i < SRC_WIDTH; i += 4)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ weights = vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x));
+
+ int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp0 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp1 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp2 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ tmp3 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3));
+
+ acc0 += dot(weights, tmp0);
+ acc1 += dot(weights, tmp1);
+ acc2 += dot(weights, tmp2);
+ acc3 += dot(weights, tmp3);
+ }
+
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+ int rows_left = SRC_HEIGHT - (y + 4);
+
+ // This check handles the last few rows when SRC_HEIGHT is not divisible by four
+ if(rows_left >= 0)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out = (VEC_DATA_TYPE(DATA_TYPE, 4))(acc0, acc1, acc2, acc3);
+ vstore4(out, 0, (__global DATA_TYPE *)output_ptr);
+ }
+ else
+ {
+ switch(rows_left)
+ {
+ case -1: // three rows left; one is padding
+ *((__global DATA_TYPE *)(output_ptr + 2 * dst_stride_x)) = acc2;
+ case -2: // two rows left; two are padding
+ *((__global DATA_TYPE *)(output_ptr + 1 * dst_stride_x)) = acc1;
+ case -3: // one row left; three are padding
+ *((__global DATA_TYPE *)(output_ptr + 0 * dst_stride_x)) = acc0;
+ break;
+ }
+ }
+}
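Note: a hypothetical configuration for gemm_mv (example values only). Each work item accumulates four rows of one input plane against one row of the reshaped weights, so global dimension 1 spans roughly ceil(SRC_HEIGHT / 4) work items and global dimension 2 one work item per plane.

/* Example build options (hypothetical values):
 *
 *   -DDATA_TYPE=float -DSRC_WIDTH=64 -DSRC_HEIGHT=64
 */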
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 6db8ed5..68af64e 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -26,8 +26,16 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#define EXPAND(x) x
+
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+#define VLOAD_STR(size) vload##size
+#define VLOAD(size) VLOAD_STR(size)
+
+#define VSTORE_STR(size) vstore##size
+#define VSTORE(size) VSTORE_STR(size)
+
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
@@ -64,6 +72,18 @@
uint name##_step_z, \
uint name##_offset_first_element_in_bytes
+#define TENSOR4D_DECLARATION(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_stride_y, \
+ uint name##_step_y, \
+ uint name##_stride_z, \
+ uint name##_step_z, \
+ uint name##_stride_w, \
+ uint name##_step_w, \
+ uint name##_offset_first_element_in_bytes
+
#define CONVERT_TO_VECTOR_STRUCT(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
@@ -76,6 +96,15 @@
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
#define CONVERT_TO_TENSOR3D_STRUCT(name) \
update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
name##_stride_z, name##_step_z)
@@ -83,6 +112,13 @@
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+ name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+
/** Structure to hold Vector information */
typedef struct Vector
{
@@ -110,6 +146,17 @@
int stride_z; /**< Stride of the image in Z dimension (in bytes) */
} Tensor3D;
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+ int stride_w; /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's data.
*
* @param[in] ptr Pointer to the starting position of the buffer
@@ -155,6 +202,32 @@
return img;
}
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+ Image img =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y
+ };
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return img;
+}
+
/** Wrap 3D tensor information into a tensor structure, and make the pointer point at this workitem's data.
*
* @param[in] ptr Pointer to the starting position of the buffer
@@ -182,6 +255,24 @@
return tensor;
}
+Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+ uint step_w,
+ uint mod_size)
+{
+ Tensor4D tensor =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w
+ };
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+ return tensor;
+}
+
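Note: the new 4D helpers fold the Z and W dimensions onto the third global dimension; mod_size is the number of Z slices that share one W index, so get_global_id(2) is split back as z = gid % mod_size and w = gid / mod_size. A minimal usage sketch (hypothetical kernel name and argument):

/* Hypothetical kernel walking a 4D tensor, where "depth" plays the role of mod_size. */
__kernel void example_tensor4d_walk(TENSOR4D_DECLARATION(in), uint depth)
{
    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(in, depth);

    /* in.ptr now points at this workitem's element, with
     * z = get_global_id(2) % depth and w = get_global_id(2) / depth. */
    *in.ptr = 0;
}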
/** Get the pointer position of a Vector
*
* @param[in] vec Pointer to the starting position of the buffer
@@ -205,7 +296,7 @@
/** Get the pointer position of a Tensor3D
*
- * @param[in] tensor Pointer to the starting postion of the buffer
+ * @param[in] tensor Pointer to the starting position of the buffer
* @param[in] x Relative X position
* @param[in] y Relative Y position
* @param[in] z Relative Z position
@@ -215,4 +306,17 @@
return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ * @param[in] w Relative W position
+ */
+__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
+}
+
#endif // _HELPER_H
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
index 31dd57b..5d3a607 100644
--- a/src/core/CL/cl_kernels/hog.cl
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -24,7 +24,7 @@
#include "helpers.h"
#include "types.h"
-#if(defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+#if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE)
/** This OpenCL kernel computes the HOG orientation binning
*
@@ -159,21 +159,21 @@
((__global float *)dst.ptr)[xc] = bins[xc];
}
}
-#endif // (defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+#endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */
-#if(defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+#if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)
#ifndef L2_NORM
#error The value of enum class HOGNormType::L2_NORM has not been passed to the OpenCL kernel
-#endif
+#endif /* not L2_NORM */
#ifndef L2HYS_NORM
#error The value of enum class HOGNormType::L2HYS_NORM has not been passed to the OpenCL kernel
-#endif
+#endif /* not L2HYS_NORM */
#ifndef L1_NORM
#error The value of enum class HOGNormType::L1_NORM has not been passed to the OpenCL kernel
-#endif
+#endif /* not L1_NORM */
/** This OpenCL kernel computes the HOG block normalization
*
@@ -231,13 +231,13 @@
sum_f32 += val1 * val1;
sum_f32 += val2 * val2;
sum_f32 += val3 * val3;
-#else
+#else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
// Compute |val| for L1_NORM
sum_f32 += fabs(val0);
sum_f32 += fabs(val1);
sum_f32 += fabs(val2);
sum_f32 += fabs(val3);
-#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
// Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
// This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
@@ -255,9 +255,9 @@
#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
sum += val * val;
-#else
+#else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
sum += fabs(val);
-#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
}
@@ -322,7 +322,7 @@
// We use the same constants of OpenCV
scale = 1.0f / (sqrt(sum) + 1e-3f);
-#endif // (HOG_NORM_TYPE == L2HYS_NORM)
+#endif /* (HOG_NORM_TYPE == L2HYS_NORM) */
int i = 0;
for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
@@ -349,9 +349,9 @@
((__global float *)dst.ptr)[i] *= scale;
}
}
-#endif // (defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+#endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
-#if(defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT)
+#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(BLOCK_STRIDE_WIDTH) && defined(BLOCK_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
/** This OpenCL kernel computes the HOG detector using linear SVM
*
@@ -452,4 +452,5 @@
}
}
}
-#endif // defined BIAS && defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && ...
+#endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
+ * BLOCK_STRIDE_WIDTH && BLOCK_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
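The normalization branches above reduce to the standard HOG block norms. A scalar sketch of the L2 and L2-Hys paths follows; the clip-then-renormalize step is the usual L2-Hys definition, and only the 1e-3f stabilizer of the second pass is visible in the hunk above, so the first-pass constant is an assumption.

#include <math.h>
#include <stddef.h>

/* Scalar sketch of the L2 / L2-Hys block normalization path.
 * For L2-Hys the block is clipped at l2_hyst_threshold and renormalized. */
static void hog_block_normalize_l2(float *bins, size_t n, int use_hys, float l2_hyst_threshold)
{
    float sum = 0.0f;
    for(size_t i = 0; i < n; ++i)
    {
        sum += bins[i] * bins[i];
    }
    /* First-pass stabilizer is an assumption; the kernel's exact constant is not shown in this hunk. */
    float scale = 1.0f / (sqrtf(sum) + 1e-3f);
    for(size_t i = 0; i < n; ++i)
    {
        bins[i] *= scale;
    }
    if(use_hys)
    {
        sum = 0.0f;
        for(size_t i = 0; i < n; ++i)
        {
            bins[i] = fminf(bins[i], l2_hyst_threshold);
            sum += bins[i] * bins[i];
        }
        scale = 1.0f / (sqrtf(sum) + 1e-3f); /* same constant as OpenCV, as in the kernel */
        for(size_t i = 0; i < n; ++i)
        {
            bins[i] *= scale;
        }
    }
}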
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
new file mode 100644
index 0000000..8d47631
--- /dev/null
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel performs L2 normalization, scaling the input by 1 / sqrt(max(sum, epsilon)), where sum holds a precomputed sum of squares.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the sum tensor (precomputed sum of squares). Supported data types: QS8/F16/F32
+ * @param[in] sum_stride_x Stride of the sum tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] epsilon Epsilon value
+ */
+__kernel void l2_normalize(
+ VECTOR_DECLARATION(src),
+ VECTOR_DECLARATION(sum),
+ VECTOR_DECLARATION(dst),
+ DATA_TYPE epsilon)
+{
+ Vector src = CONVERT_TO_VECTOR_STRUCT(src);
+ Vector sum = CONVERT_TO_VECTOR_STRUCT(sum);
+ Vector dst = CONVERT_TO_VECTOR_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, (__global DATA_TYPE *)src.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))native_rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon));
+
+ vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
\ No newline at end of file
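The element-wise operation of the new kernel is out[i] = in[i] * rsqrt(max(sum, epsilon)), with sum expected to hold the precomputed sum of squares along the normalized axis. A scalar C reference under that assumption:

#include <math.h>
#include <stddef.h>

/* Scalar reference for l2_normalize: scale every input element by
 * 1 / sqrt(max(sum_of_squares, epsilon)). */
static void l2_normalize_ref(const float *in, float sum_of_squares, float epsilon, float *out, size_t n)
{
    const float inv_norm = 1.0f / sqrtf(fmaxf(sum_of_squares, epsilon));
    for(size_t i = 0; i < n; ++i)
    {
        out[i] = in[i] * inv_norm;
    }
}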
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
index c4b0df8..e9845e0 100644
--- a/src/core/CL/cl_kernels/magnitude_phase.cl
+++ b/src/core/CL/cl_kernels/magnitude_phase.cl
@@ -81,17 +81,17 @@
#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y))
#elif(2 == MAGNITUDE)
#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y))
-#else
+#else /* MAGNITUDE */
#define MAGNITUDE_OP(x, y)
-#endif
+#endif /* MAGNITUDE */
#if(1 == PHASE)
#define PHASE_OP(x, y) phase_unsigned((x), (y))
#elif(2 == PHASE)
#define PHASE_OP(x, y) phase_signed((x), (y))
-#else
+#else /* PHASE */
#define PHASE_OP(x, y)
-#endif
+#endif /* PHASE */
/** Calculate the magnitude and phase given the gradients of an image.
*
@@ -133,11 +133,11 @@
#ifdef MAGNITUDE
,
IMAGE_DECLARATION(magnitude)
-#endif
+#endif /* MAGNITUDE */
#ifdef PHASE
,
IMAGE_DECLARATION(phase)
-#endif
+#endif /* PHASE */
)
{
// Get pixels pointer
@@ -154,9 +154,9 @@
#ifdef MAGNITUDE
Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude);
vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr);
-#endif
+#endif /* MAGNITUDE */
#ifdef PHASE
Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr);
-#endif
+#endif /* PHASE */
}
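For orientation, the two MAGNITUDE variants selected by the macros above correspond to the usual gradient norms, while the PHASE variants quantize atan2(gy, gx) to 8 bits. A scalar sketch of the magnitude formulas only:

#include <math.h>

/* Reference formulas for the two magnitude variants selected above. */
static float magnitude_l1_ref(float gx, float gy) /* 1 == MAGNITUDE */
{
    return fabsf(gx) + fabsf(gy);
}

static float magnitude_l2_ref(float gx, float gy) /* 2 == MAGNITUDE */
{
    return sqrtf(gx * gx + gy * gy);
}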
diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl
index 50b8312..7c29d2f 100644
--- a/src/core/CL/cl_kernels/mean_stddev.cl
+++ b/src/core/CL/cl_kernels/mean_stddev.cl
@@ -44,19 +44,19 @@
IMAGE_DECLARATION(src),
uint height,
__global ulong *global_sum
-#if defined STDDEV
+#ifdef STDDEV
,
__global ulong *global_sum_sq
-#endif
+#endif /* STDDEV */
)
{
// Get pixels pointer
Image src = CONVERT_TO_IMAGE_STRUCT(src);
- uint8 tmp_sum = 0;
-#if defined STDDEV
- uint8 tmp_sum_sq = 0;
-#endif
+ uint8 tmp_sum = 0;
+#ifdef STDDEV
+ uint8 tmp_sum_sq = 0;
+#endif /* STDDEV */
// Calculate partial sum
for(int i = 0; i < height; i++)
{
@@ -64,20 +64,20 @@
uint8 data = convert_uint8(vload8(0, offset(&src, 0, i)));
tmp_sum += data;
-#if defined STDDEV
+#ifdef STDDEV
tmp_sum_sq += data * data;
-#endif
+#endif /* STDDEV */
}
// Perform reduction
tmp_sum.s0123 += tmp_sum.s4567;
tmp_sum.s01 += tmp_sum.s23;
atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1);
-#if defined STDDEV
+#ifdef STDDEV
tmp_sum_sq.s0123 += tmp_sum_sq.s4567;
tmp_sum_sq.s01 += tmp_sum_sq.s23;
atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1);
-#endif
+#endif /* STDDEV */
}
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
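The kernel above only accumulates a global sum (and, when STDDEV is defined, a global sum of squares); the caller is expected to finish the computation. A host-side sketch of that final step, assuming num_pixels values contributed to the sums:

#include <math.h>

/* Host-side sketch: derive mean and standard deviation from the accumulated
 * sum and sum of squares produced by the kernel above. */
static void finalize_mean_stddev(unsigned long long sum, unsigned long long sum_sq,
                                 unsigned long long num_pixels, float *mean, float *stddev)
{
    *mean   = (float)sum / (float)num_pixels;
    *stddev = sqrtf((float)sum_sq / (float)num_pixels - (*mean) * (*mean));
}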
diff --git a/src/core/CL/cl_kernels/minmax_layer.cl b/src/core/CL/cl_kernels/minmax_layer.cl
new file mode 100644
index 0000000..1e543b4
--- /dev/null
+++ b/src/core/CL/cl_kernels/minmax_layer.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
+/** This function identifies the minimum and maximum values of an input 3D tensor.
+ *
+ * @note The width, height and depth of the input tensor must be provided at compile time using -DWIDTH, -DHEIGHT and -DDEPTH (e.g. -DWIDTH=320, -DHEIGHT=240, -DDEPTH=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
+ * @param[in] dst_stride_x Stride of the min/max vector in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ */
+__kernel void minmax_layer(
+ TENSOR3D_DECLARATION(src),
+ VECTOR_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Vector dst = CONVERT_TO_VECTOR_STRUCT(dst);
+
+ float4 min_value = (float4)FLT_MAX;
+ float4 max_value = (float4) - FLT_MAX;
+ float2 min_max_value = (float2)(FLT_MAX, -FLT_MAX);
+
+ for(int z = 0; z < DEPTH; ++z)
+ {
+ for(int y = 0; y < HEIGHT; ++y)
+ {
+ int x = 0;
+ __global float *src_addr = (__global float *)(src.ptr + y * src_stride_y + z * src_stride_z);
+
+ for(; x <= (int)(WIDTH - 8); x += 8)
+ {
+ float8 value = vload8(0, src_addr + x);
+
+ min_value = select(value.s0123, min_value, min_value < value.s0123);
+ min_value = select(value.s4567, min_value, min_value < value.s4567);
+
+ max_value = select(value.s0123, max_value, max_value > value.s0123);
+ max_value = select(value.s4567, max_value, max_value > value.s4567);
+ }
+
+ for(; x < WIDTH; ++x)
+ {
+ float value = *(src_addr + x);
+
+ min_max_value.s0 = min(min_max_value.s0, value);
+ min_max_value.s1 = max(min_max_value.s1, value);
+ }
+ }
+ }
+
+ // Perform min/max reduction
+ min_value.s01 = min(min_value.s01, min_value.s23);
+ min_value.s0 = min(min_value.s0, min_value.s1);
+ max_value.s01 = max(max_value.s01, max_value.s23);
+ max_value.s0 = max(max_value.s0, max_value.s1);
+
+ min_max_value.s0 = min(min_max_value.s0, min_value.s0);
+ min_max_value.s1 = max(min_max_value.s1, max_value.s0);
+
+ if(min_max_value.s0 == min_max_value.s1)
+ {
+ min_max_value.s0 = 0.0f;
+ min_max_value.s1 = 1.0f;
+ }
+
+ // Store min and max
+ vstore2(min_max_value, 0, (__global float *)dst.ptr);
+}
+#endif // defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
\ No newline at end of file
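A scalar restatement of what the vectorized loops and the final reduction compute, including the fallback to the [0, 1] range when the tensor is constant, assuming a densely packed float buffer:

#include <float.h>
#include <stddef.h>

/* Scalar reference for minmax_layer: find min/max over the whole tensor and
 * fall back to the [0, 1] range when all values are equal. */
static void minmax_layer_ref(const float *data, size_t count, float out_min_max[2])
{
    float min_v = FLT_MAX;
    float max_v = -FLT_MAX;
    for(size_t i = 0; i < count; ++i)
    {
        min_v = (data[i] < min_v) ? data[i] : min_v;
        max_v = (data[i] > max_v) ? data[i] : max_v;
    }
    if(min_v == max_v)
    {
        min_v = 0.0f;
        max_v = 1.0f;
    }
    out_min_max[0] = min_v;
    out_min_max[1] = max_v;
}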
diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl
index 799b1e8..0f557a4 100644
--- a/src/core/CL/cl_kernels/minmaxloc.cl
+++ b/src/core/CL/cl_kernels/minmaxloc.cl
@@ -26,15 +26,26 @@
#ifndef DATA_TYPE_MIN
#define DATA_TYPE_MIN 0x0
-#endif
+#endif /* DATA_TYPE_MIN */
#ifndef DATA_TYPE_MAX
#define DATA_TYPE_MAX 0xFF
-#endif
+#endif /* DATA_TYPE_MAX */
+
+inline int FloatFlip(float val)
+{
+ union
+ {
+ int int_val;
+ float flt_val;
+ } u_val;
+ u_val.flt_val = val;
+ return (u_val.int_val >= 0) ? u_val.int_val : u_val.int_val ^ 0x7FFFFFFF;
+}
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
-__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+__constant int16 idx16 = (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/** This function identifies the minimum and maximum values of an input image.
*
@@ -54,7 +65,7 @@
__kernel void minmax(
IMAGE_DECLARATION(src),
__global int *min_max,
- uint width)
+ int width)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
@@ -65,11 +76,11 @@
local_max = type_min;
// Calculate min/max of row
- uint width4 = width >> 4;
- for(uint i = 0; i < width4; i++)
+ int i = 0;
+ for(; i + 16 <= width; i += 16)
{
VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
local_min = min(data, local_min);
local_max = max(data, local_max);
}
@@ -77,12 +88,16 @@
#ifdef NON_MULTIPLE_OF_16
// Handle non multiple of 16
VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0));
+#ifdef IS_DATA_TYPE_FLOAT
+ int16 valid_indices = (i + idx16) < width;
+#else /* IS_DATA_TYPE_FLOAT */
VEC_DATA_TYPE(DATA_TYPE, 16)
- widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
- local_max = max(local_max, select(type_min, data, widx));
- local_min = min(local_min, select(type_max, data, widx));
-#endif
+ valid_indices = CONVERT((i + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
+#endif /* IS_DATA_TYPE_FLOAT */
+ local_max = max(local_max, select(type_min, data, valid_indices));
+ local_min = min(local_min, select(type_max, data, valid_indices));
+#endif /* NON_MULTIPLE_OF_16 */
// Perform min/max reduction
local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF);
@@ -98,8 +113,13 @@
local_max.s0 = max(local_max.s0, local_max.s1);
// Update global min/max
+#ifdef IS_DATA_TYPE_FLOAT
+ atomic_min(&min_max[0], FloatFlip(local_min.s0));
+ atomic_max(&min_max[1], FloatFlip(local_max.s0));
+#else /* IS_DATA_TYPE_FLOAT */
atomic_min(&min_max[0], local_min.s0);
atomic_max(&min_max[1], local_max.s0);
+#endif /* IS_DATA_TYPE_FLOAT */
}
/** This function counts the min and max occurrences in an image and tags their position.
@@ -124,41 +144,50 @@
IMAGE_DECLARATION(src),
__global int *min_max,
__global uint *min_max_count
-#if defined LOCATE_MIN
+#ifdef LOCATE_MIN
,
__global Coordinates2D *min_loc, uint max_min_loc_count
-#endif
-#if defined LOCATE_MAX
+#endif /* LOCATE_MIN */
+#ifdef LOCATE_MAX
,
__global Coordinates2D *max_loc, uint max_max_loc_count
-#endif
+#endif /* LOCATE_MAX */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef IS_DATA_TYPE_FLOAT
+ __global float *min_max_ptr = (__global float *)min_max;
+ float min_value = min_max_ptr[0];
+ float max_value = min_max_ptr[1];
+#else /* IS_DATA_TYPE_FLOAT */
+ int min_value = min_max[0];
+ int max_value = min_max[1];
+#endif /* IS_DATA_TYPE_FLOAT */
+
DATA_TYPE value = *((__global DATA_TYPE *)src.ptr);
-#if defined COUNT_MIN_MAX
- if(value == min_max[0])
+#ifdef COUNT_MIN_MAX
+ if(value == min_value)
{
uint idx = atomic_inc(&min_max_count[0]);
-#if defined LOCATE_MIN
+#ifdef LOCATE_MIN
if(idx < max_min_loc_count)
{
min_loc[idx].x = get_global_id(0);
min_loc[idx].y = get_global_id(1);
}
-#endif
+#endif /* LOCATE_MIN */
}
- if(value == min_max[1])
+ if(value == max_value)
{
uint idx = atomic_inc(&min_max_count[1]);
-#if defined LOCATE_MAX
+#ifdef LOCATE_MAX
if(idx < max_max_loc_count)
{
max_loc[idx].x = get_global_id(0);
max_loc[idx].y = get_global_id(1);
}
-#endif
+#endif /* LOCATE_MAX */
}
-#endif
+#endif /* COUNT_MIN_MAX */
}
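The FloatFlip helper introduced above maps float bit patterns to signed integers whose ordering matches the float ordering: non-negative floats keep their bits, negative floats have their low 31 bits inverted, so atomic_min/atomic_max on the int buffer remain correct for float inputs. The mapping is an involution, so the same transform also undoes it; the host-side unflip below is a sketch under that assumption, not code from the patch.

#include <stdint.h>
#include <string.h>

/* Same mapping as the kernel's FloatFlip: float bits whose signed comparison
 * order matches the float comparison order. */
static int32_t float_flip(float val)
{
    int32_t bits;
    memcpy(&bits, &val, sizeof(bits));
    return (bits >= 0) ? bits : (int32_t)(bits ^ 0x7FFFFFFF);
}

/* Host-side sketch (assumption): recover the float from a flipped value,
 * exploiting that the transform is its own inverse. */
static float float_unflip(int32_t flipped)
{
    int32_t bits = (flipped >= 0) ? flipped : (int32_t)(flipped ^ 0x7FFFFFFF);
    float   val;
    memcpy(&val, &bits, sizeof(val));
    return val;
}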
diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
index f860c96..19118ea 100644
--- a/src/core/CL/cl_kernels/non_linear_filter3x3.cl
+++ b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
@@ -54,13 +54,13 @@
uchar16 bottom = vload16(0, offset(&src, -1, 1));
// Apply respective filter
-#if defined MIN
- uchar16 tmp = min(top, min(middle, bottom));
- uchar8 out = row_reduce_min_3(tmp);
-#elif defined MAX
+#ifdef MIN
+ uchar16 tmp = min(top, min(middle, bottom));
+ uchar8 out = row_reduce_min_3(tmp);
+#elif defined(MAX)
uchar16 tmp = max(top, max(middle, bottom));
uchar8 out = row_reduce_max_3(tmp);
-#elif defined MEDIAN
+#elif defined(MEDIAN)
uchar8 p0 = top.s01234567;
uchar8 p1 = top.s12345678;
uchar8 p2 = top.s23456789;
@@ -71,9 +71,9 @@
uchar8 p7 = bottom.s12345678;
uchar8 p8 = bottom.s23456789;
uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
@@ -109,22 +109,22 @@
uchar8 bottom = vload8(0, offset(&src, 0, 1));
// Apply respective filter
-#if defined MIN
- uchar8 tmp_middle = row_reduce_min_3(middle);
- uchar8 out = min(tmp_middle, min(top, bottom));
-#elif defined MAX
+#ifdef MIN
+ uchar8 tmp_middle = row_reduce_min_3(middle);
+ uchar8 out = min(tmp_middle, min(top, bottom));
+#elif defined(MAX)
uchar8 tmp_middle = row_reduce_max_3(middle);
uchar8 out = max(tmp_middle, max(top, bottom));
-#elif defined MEDIAN
+#elif defined(MEDIAN)
uchar8 p0 = top.s01234567;
uchar8 p1 = middle.s01234567;
uchar8 p2 = middle.s12345678;
uchar8 p3 = middle.s23456789;
uchar8 p4 = bottom.s01234567;
uchar8 out = sort5(p0, p1, p2, p3, p4);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
@@ -160,13 +160,13 @@
uchar16 bottom = vload16(0, offset(&src, -1, 1));
// Apply respective filter
-#if defined MIN
- uchar16 tmp = min(top, min(middle, bottom));
- uchar8 out = row_reduce_min_3(tmp);
-#elif defined MAX
+#ifdef MIN
+ uchar16 tmp = min(top, min(middle, bottom));
+ uchar8 out = row_reduce_min_3(tmp);
+#elif defined(MAX)
uchar16 tmp = max(top, max(middle, bottom));
uchar8 out = row_reduce_max_3(tmp);
-#elif defined MEDIAN
+#elif defined(MEDIAN)
uchar8 p0 = top.s01234567;
uchar8 p1 = top.s12345678;
uchar8 p2 = top.s23456789;
@@ -177,9 +177,9 @@
uchar8 p7 = bottom.s12345678;
uchar8 p8 = bottom.s23456789;
uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
index d9ae95f..d3b2958 100644
--- a/src/core/CL/cl_kernels/non_linear_filter5x5.cl
+++ b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
@@ -351,17 +351,17 @@
uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
// Apply respective filter
-#if defined MIN
- uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2)));
- uchar8 out = row_reduce_min_5(tmp);
-#elif defined MAX
+#ifdef MIN
+ uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2)));
+ uchar8 out = row_reduce_min_5(tmp);
+#elif defined(MAX)
uchar16 tmp = max(middle, max(max(top2, top), max(bottom, bottom2)));
uchar8 out = row_reduce_max_5(tmp);
-#elif defined MEDIAN
+#elif defined(MEDIAN)
uchar8 out = median_box5x5(top2, top, middle, bottom, bottom2);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
@@ -392,33 +392,33 @@
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
// Load values
- uchar16 top2 = vload16(0, offset(&src, 0, -2));
- uchar16 top = vload16(0, offset(&src, 0, -1));
+ uchar8 top2 = vload8(0, offset(&src, 0, -2));
+ uchar8 top = vload8(0, offset(&src, 0, -1));
uchar16 middle = vload16(0, offset(&src, -2, 0));
- uchar16 bottom = vload16(0, offset(&src, 0, 1));
- uchar16 bottom2 = vload16(0, offset(&src, 0, 2));
+ uchar8 bottom = vload8(0, offset(&src, 0, 1));
+ uchar8 bottom2 = vload8(0, offset(&src, 0, 2));
// Apply respective filter
-#if defined MIN
- uchar8 tmp_middle = row_reduce_min_5(middle);
- uchar8 out = min(tmp_middle, min(min(top2.s01234567, top.s01234567), min(bottom.s01234567, bottom2.s01234567)));
-#elif defined MAX
+#ifdef MIN
+ uchar8 tmp_middle = row_reduce_min_5(middle);
+ uchar8 out = min(tmp_middle, min(min(top2, top), min(bottom, bottom2)));
+#elif defined(MAX)
uchar8 tmp_middle = row_reduce_max_5(middle);
- uchar8 out = max(tmp_middle, max(max(top2.s01234567, top.s01234567), max(bottom.s01234567, bottom2.s01234567)));
-#elif defined MEDIAN
- uchar8 p0 = top2.s01234567;
- uchar8 p1 = top.s01234567;
+ uchar8 out = max(tmp_middle, max(max(top2, top.s01234567), max(bottom, bottom2)));
+#elif defined(MEDIAN)
+ uchar8 p0 = top2;
+ uchar8 p1 = top;
uchar8 p2 = middle.s01234567;
uchar8 p3 = middle.s12345678;
uchar8 p4 = middle.s23456789;
uchar8 p5 = middle.s3456789A;
uchar8 p6 = middle.s456789AB;
- uchar8 p7 = bottom.s01234567;
- uchar8 p8 = bottom2.s01234567;
+ uchar8 p7 = bottom;
+ uchar8 p8 = bottom2;
uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
@@ -449,30 +449,34 @@
Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
// Load values
- uchar16 top2 = vload16(0, offset(&src, -1, -2));
+ uchar16 top2 = vload16(0, offset(&src, -2, -2));
uchar16 top = vload16(0, offset(&src, -2, -1));
uchar16 middle = vload16(0, offset(&src, -2, 0));
uchar16 bottom = vload16(0, offset(&src, -2, 1));
- uchar16 bottom2 = vload16(0, offset(&src, -1, 2));
+ uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
+
+ // Shift top2 and bottom2 values
+ top2 = top2.s123456789ABCDEFF;
+ bottom2 = bottom2.s123456789ABCDEFF;
// Apply respective filter
-#if defined MIN
- uchar16 tmp_3 = min(top2, bottom2);
- uchar16 tmp_5 = min(middle, min(top, bottom));
- uchar8 tmp_3_red = row_reduce_min_3(tmp_3);
- uchar8 tmp_5_red = row_reduce_min_5(tmp_5);
- uchar8 out = min(tmp_3_red, tmp_5_red);
-#elif defined MAX
+#ifdef MIN
+ uchar16 tmp_3 = min(top2, bottom2);
+ uchar16 tmp_5 = min(middle, min(top, bottom));
+ uchar8 tmp_3_red = row_reduce_min_3(tmp_3);
+ uchar8 tmp_5_red = row_reduce_min_5(tmp_5);
+ uchar8 out = min(tmp_3_red, tmp_5_red);
+#elif defined(MAX)
uchar16 tmp_3 = max(top2, bottom2);
uchar16 tmp_5 = max(middle, max(top, bottom));
uchar8 tmp_3_red = row_reduce_max_3(tmp_3);
uchar8 tmp_5_red = row_reduce_max_5(tmp_5);
uchar8 out = max(tmp_3_red, tmp_5_red);
-#elif defined MEDIAN
+#elif defined(MEDIAN)
uchar8 out = median_disk5x5(top2, top, middle, bottom, bottom2);
-#else
+#else /* MIN or MAX or MEDIAN */
#error "Unsupported filter function"
-#endif
+#endif /* MIN or MAX or MEDIAN */
// Store result
vstore8(out, 0, dst.ptr);
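The MEDIAN branches rely on the sort5/sort9 sorting networks from the helpers; functionally they select the middle value of the gathered window. A plain scalar equivalent (not the kernel's sorting network) for a 9-element window:

/* Scalar equivalent of the median-of-9 selection performed by sort9 in the
 * MEDIAN branches above: sort the window and return the middle element. */
static unsigned char median9_ref(unsigned char v[9])
{
    for(int i = 1; i < 9; ++i)
    {
        const unsigned char key = v[i];
        int j = i - 1;
        while(j >= 0 && v[j] > key)
        {
            v[j + 1] = v[j];
            --j;
        }
        v[j + 1] = key;
    }
    return v[4];
}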
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index 076b0d8..4e65560 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -23,132 +23,152 @@
*/
#include "helpers.h"
+#if defined(FIXED_POINT_POSITION)
+
+#include "fixed_point.h"
+#define MUL_OP(x, y) MUL_SAT_OP_EXPAND((x), (y), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define ADD_OP(x, y) ADD_SAT_OP_EXPAND((x), (y), DATA_TYPE, VEC_SIZE)
+#define DIV_OP(x, y) DIV_SAT_OP_VEC_EXPAND((x), (y), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define EXP_OP(x) EXP_OP_EXPAND((x), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define LOG_OP(x) LOG_OP_EXPAND((x), DATA_TYPE, VEC_SIZE, FIXED_POINT_POSITION)
+#define POW_OP(x, y) EXP_OP(MUL_OP(LOG_OP((x)), (y)))
+#define SQCVT_SAT(a) SQCVT_SAT_OP_EXPAND((a), DATA_TYPE, FIXED_POINT_POSITION)
+
+#define LOAD_OP(offset, ptr) vload16(offset, ptr)
+#define STORE_OP(data, offset, ptr) vstore16(data, offset, ptr)
+
+#else // FIXED_POINT_POSITION
+
+#define MUL_OP(x, y) ((x) * (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define POW_OP(x, y) pow((x), (y))
+#define SQCVT_SAT(a) (a)
+
+#define LOAD_OP(offset, ptr) vload4(offset, ptr)
+#define STORE_OP(data, offset, ptr) vstore4(data, offset, ptr)
+
+#endif // FIXED_POINT_POSITION
+
/** Apply cross map normalization.
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16, F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] squared_input_ptr Pointer to the second source tensor. Supported data types: F16, F32
- * @param[in] squared_input_stride_x Stride of the second source tensor in X dimension (in bytes)
- * @param[in] squared_input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] squared_input_stride_y Stride of the second source tensor in Y dimension (in bytes)
- * @param[in] squared_input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] squared_input_stride_z Stride of the second source tensor in Z dimension (in bytes)
- * @param[in] squared_input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] squared_input_offset_first_element_in_bytes The offset of the second element in the second source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16, F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] coeff Alpha parameter / norm_size
- * @param[in] beta Beta parameter in the normalization equation
- * @param[in] kappa Kappa parameter in the normalization equation
- * @param[in] radius Number of elements on the right or left side to normalize across
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(squared_input),
- TENSOR3D_DECLARATION(output),
- float coeff,
- float beta,
- float kappa,
- uint radius)
+ TENSOR3D_DECLARATION(output))
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
- DATA_TYPE acc = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
- const int num_of_slices = get_global_size(2);
const int current_slice = get_global_id(2);
- const int left_slice = max(current_slice - (int)radius, (int)0);
- const int right_slice = min(current_slice + (int)radius, (int)(num_of_slices - 1));
+ const int left_slice = max(current_slice - (int)RADIUS, (int)0);
+ const int right_slice = min(current_slice + (int)RADIUS, (int)(NUM_SLICES - 1));
for(int i = left_slice; i <= right_slice; i++)
{
- acc += *(__global DATA_TYPE *)tensor3D_offset(&squared_in, 0, 0, i - current_slice);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i - current_slice));
+ acc = ADD_OP(acc, MUL_OP(values, values));
}
- const float normalized = pow(kappa + coeff * (float)acc, beta);
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
- const float normalized_pixel = (float) * ((__global DATA_TYPE *)in.ptr) / normalized;
-
- *(__global DATA_TYPE *)out.ptr = CONVERT(normalized_pixel, DATA_TYPE);
+ STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
}
/** Apply in map normalization.
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16, F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] squared_input_ptr Pointer to the second source tensor. Supported data types: F16, F32
- * @param[in] squared_input_stride_x Stride of the second source tensor in X dimension (in bytes)
- * @param[in] squared_input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] squared_input_stride_y Stride of the second source tensor in Y dimension (in bytes)
- * @param[in] squared_input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] squared_input_stride_z Stride of the second source tensor in Z dimension (in bytes)
- * @param[in] squared_input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] squared_input_offset_first_element_in_bytes The offset of the second element in the second source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16, F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] coeff Alpha parameter / norm_size
- * @param[in] beta Beta parameter in the normalization equation
- * @param[in] kappa Kappa parameter in the normalization equation
- * @param[in] radius Number of elements on the right or left side to normalize across
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void normalization_layer_in_map_1D(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(squared_input),
- TENSOR3D_DECLARATION(output),
- float coeff,
- float beta,
- float kappa,
- uint radius)
+ TENSOR3D_DECLARATION(output))
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- acc_vec = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
const int current_pos = get_global_id(0) << 2;
- const int left_pos = max(current_pos - (int)radius, -3);
- const int right_pos = min(current_pos + (int)radius, (int)((get_global_size(0) << 2) + 3 - 1));
+ const int left_pos = max(current_pos - (int)RADIUS, -3);
+ const int right_pos = min(current_pos + (int)RADIUS, (int)((get_global_size(0) << 2) + 3 - 1));
for(int i = left_pos; i <= right_pos; i += 1)
{
- acc_vec += vload4(0, (__global DATA_TYPE *)tensor3D_offset(&squared_in, i - current_pos, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i - current_pos, 0, 0));
+ acc = ADD_OP(acc, MUL_OP(values, values));
}
- const float4 normalized = pow((float4)kappa + coeff * (float4)acc_vec, beta);
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
- const float4 normalized_pixel = CONVERT(vload4(0, (__global DATA_TYPE *)in.ptr), float4) / normalized;
-
- vstore4(CONVERT(normalized_pixel, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)out.ptr);
+ STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
}
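In scalar form, the cross-map path above computes out = in / (KAPPA + COEFF * sum of in_j^2 over the window)^BETA, with the window of 2*RADIUS+1 slices clamped to [0, NUM_SLICES). A float-only reference, fixed-point paths omitted:

#include <math.h>

/* Scalar float reference for cross-map normalization at one (x, y) position;
 * slices[i] is the value of feature map i at that position. */
static float cross_map_normalize_ref(const float *slices, int num_slices, int current_slice,
                                     int radius, float coeff, float beta, float kappa)
{
    const int left  = (current_slice - radius < 0) ? 0 : current_slice - radius;
    const int right = (current_slice + radius > num_slices - 1) ? num_slices - 1 : current_slice + radius;

    float acc = 0.0f;
    for(int i = left; i <= right; ++i)
    {
        acc += slices[i] * slices[i];
    }
    return slices[current_slice] / powf(kappa + coeff * acc, beta);
}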
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
index ae2031f..f4f36a0 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -25,9 +25,9 @@
#ifdef SATURATE
#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
-#else
+#else /* SATURATE */
#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
-#endif
+#endif /* SATURATE */
#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
@@ -43,31 +43,37 @@
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
* @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
* @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
* @param[in] scale Float scaling factor. Supported data types: F32
*/
__kernel void pixelwise_mul_float(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out),
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
const float scale)
{
// Get pixels pointer
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load data
VEC_DATA_TYPE(DATA_TYPE_RES, 16)
@@ -76,13 +82,13 @@
in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
// Perform multiplication
-#if defined DATA_TYPE_FLOAT
+#ifdef DATA_TYPE_FLOAT
VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
- res = CONVERT(in1_data * in2_data * scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-#else
+ res = CONVERT(in1_data * in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+#else /* DATA_TYPE_FLOAT */
VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
-#endif
+#endif /* DATA_TYPE_FLOAT */
// Store result
vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
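For the non-float input path above, the result is the widened product scaled by the float factor, then rounded and saturated to the output type. A scalar S16 sketch; the rounding mode is selected by the ROUND build option, so round-to-nearest is an assumption here:

#include <math.h>
#include <stdint.h>

/* Scalar sketch of pixelwise_mul_float for S16 inputs: widen, multiply,
 * apply the float scale, round, then saturate to S16. */
static int16_t pixelwise_mul_float_ref(int16_t a, int16_t b, float scale)
{
    float res = (float)((int32_t)a * (int32_t)b) * scale;
    res = nearbyintf(res); /* rounding mode assumed; the kernel picks it via ROUND */
    if(res > 32767.0f)
    {
        res = 32767.0f;
    }
    if(res < -32768.0f)
    {
        res = -32768.0f;
    }
    return (int16_t)res;
}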
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
index 05c437c..b5734a3 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -23,12 +23,28 @@
*/
#include "helpers.h"
-#ifdef SATURATE
-#define CONVERT_OP_INT_STR(x, type) (convert_##type##_sat(x))
-#else
-#define CONVERT_OP_INT_STR(x, type) (convert_##type(x))
-#endif
-#define CONVERT_OP_INT(x, type) CONVERT_OP_INT_STR(x, type)
+#if defined(FIXED_POINT_POSITION)
+
+#include "fixed_point.h"
+
+#if defined(SATURATE)
+#define MUL_OP(x, y, scale, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
+#else // SATURATE
+#define MUL_OP(x, y, scale, type, size) MUL_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
+#endif // SATURATE
+
+#else // FIXED_POINT_POSITION
+
+#if defined(SATURATE)
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
+#else // SATURATE
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
+#endif // SATURATE
+#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
+
+#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
+
+#endif // FIXED_POINT_POSITION
/** Performs a pixelwise multiplication with integer scale of integer inputs.
*
@@ -36,37 +52,44 @@
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
* @attention The data_type of the intermediate result of the multiplication should be passed as well using -DDATA_TYPE_RES.
* e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short.
+ * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16
* @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
* @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr
* @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Integer scaling factor. Supported data types: S32
+ * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1).
*/
__kernel void pixelwise_mul_int(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out),
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out),
const uint scale)
{
// Get pixels pointer
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load data
VEC_DATA_TYPE(DATA_TYPE_RES, 16)
@@ -75,5 +98,5 @@
in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
// Perform multiplication and store result
- vstore16(CONVERT_OP_INT(((in1_data * in2_data) >> scale), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+ vstore16(MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
}
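The new fixed-point MUL_OP path reduces to a Q-format multiply: products are rounded, shifted right by FIXED_POINT_POSITION and saturated, with the scale argument assumed to be 1. A scalar QS8 sketch; round-half-up and fixed_point_position >= 1 are assumptions for illustration:

#include <stdint.h>

/* Scalar sketch of a saturating QS8 fixed-point multiply (scale assumed to be 1). */
static int8_t mul_sat_qs8_ref(int8_t a, int8_t b, int fixed_point_position)
{
    const int32_t round_const = 1 << (fixed_point_position - 1);
    int32_t       res         = ((int32_t)a * (int32_t)b + round_const) >> fixed_point_position;
    if(res > 127)
    {
        res = 127;
    }
    if(res < -128)
    {
        res = -128;
    }
    return (int8_t)res;
}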
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 1902df9..99d7e6e 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -23,29 +23,186 @@
*/
#include "helpers.h"
-#if defined POOL_AVG
-#define POOL_OP(x, y) ((x) + (y))
-#else
-#define POOL_OP(x, y) (fmax((x), (y)))
-#endif
+#ifdef FIXED_POINT_POSITION
-float calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+#include "fixed_point.h"
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) add_sat(x, y)
+#else /* POOL_AVG */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* POOL_AVG */
+
+#define DIV_OP1(x, y) DIV_SAT_OP_EXPAND((x), (y), DATA_TYPE, FIXED_POINT_POSITION)
+#define DIV_OP(x, y) DIV_OP1(x, y << FIXED_POINT_POSITION)
+#define SQRT_OP(x) DIV_OP1((1 << FIXED_POINT_POSITION), (INVSQRT_OP_EXPAND((x), DATA_TYPE, 1, FIXED_POINT_POSITION)))
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) MUL_SAT_OP_EXPAND((x), (x), DATA_TYPE, vec_size, FIXED_POINT_POSITION)
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#else /* FIXED_POINT_POSITION */
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#endif /* FIXED_POINT_POSITION */
+
+#if STRIDE_X == 1
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
+#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
+#endif /* STRIDE_X == 3 */
+
+#define POOLING3x3_STRIDE1(res, input, output) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data00 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ data01 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data10 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ data11 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data20 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ data21 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
+ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01212323); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01212323); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01212323); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \
+ \
+ values00 = POOL_OP(values00, values10); \
+ values01 = POOL_OP(values01, values11); \
+ values00 = POOL_OP(values00, values20); \
+ values01 = POOL_OP(values01, values21); \
+ \
+ res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \
+ res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \
+ })
+
+#define POOLING3x3_STRIDE2(res, input, output) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
+ DATA_TYPE data01 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
+ DATA_TYPE data11 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
+ DATA_TYPE data21 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
+ data00 = POW2_OP(data00, 8); \
+ data01 = POW2_OP(data01, 1); \
+ data10 = POW2_OP(data10, 8); \
+ data11 = POW2_OP(data11, 1); \
+ data20 = POW2_OP(data20, 8); \
+ data21 = POW2_OP(data21, 1); \
+ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01223445); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s667, data01); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01223445); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data10.s667, data11); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01223445); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data20.s667, data21); \
+ \
+ values00 = POOL_OP(values00, values10); \
+ values01 = POOL_OP(values01, values11); \
+ values00 = POOL_OP(values00, values20); \
+ values01 = POOL_OP(values01, values21); \
+ \
+ res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \
+ res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \
+ })
+
+#define POOLING3x3_STRIDE3(res, input, output) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data01 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data11 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ data21 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
+ data00 = POW2_OP(data00, 8); \
+ data01 = POW2_OP(data01, 4); \
+ data10 = POW2_OP(data10, 8); \
+ data11 = POW2_OP(data11, 4); \
+ data20 = POW2_OP(data20, 8); \
+ data21 = POW2_OP(data21, 4); \
+ \
+ data00 = POOL_OP(data00, data10); \
+ data01 = POOL_OP(data01, data11); \
+ data00 = POOL_OP(data00, data20); \
+ data01 = POOL_OP(data01, data21); \
+ \
+ res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s147, data01.s2)); \
+ res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s25, data01.s03)); \
+ })
+
+DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
- int start_x = get_global_id(0) * stride_x - pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- int end_x = min(start_x + pool_size, upper_bound_w);
- int end_y = min(start_y + pool_size, upper_bound_h);
- return 1.f / ((end_y - start_y) * (end_x - start_x));
+ const int start_x = get_global_id(0) * stride_x - pad_x;
+ const int start_y = get_global_id(1) * stride_y - pad_y;
+ const int end_x = min(start_x + pool_size, upper_bound_w);
+ const int end_y = min(start_y + pool_size, upper_bound_h);
+ return ((end_y - start_y) * (end_x - start_x));
}
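+
+// A worked example (hypothetical configuration, not part of the kernel interface):
+// with pool_size = 2, stride_x = stride_y = 2, pad_x = pad_y = 0 and upper_bound_w = 7,
+// the output element at get_global_id(0) == 3 starts at start_x = 6 and is clipped to
+// end_x = min(8, 7) = 7, so only one column of the 2x2 window lies inside the image
+// and the divisor becomes 1 * 2 = 2 instead of 4 (assuming the y window is fully inside).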
/** Performs a pooling function of pool size equal to 2.
*
- * @note Pooling stride must be passed using -DPOOL_STRIDE e.g -DPOOL_STRIDE=2. Supported strides are 1,2,3
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
- * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimensions
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -53,7 +210,7 @@
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -61,18 +218,10 @@
* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad)
- * @param[in] strides The pooling operation strides in each dimension
- * @param[in] paddings The pooling operation paddings in each dimension
*/
__kernel void pooling_layer_2(
TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output)
-#ifdef POOL_AVG
- ,
- int2 max_dims, int2 strides, int2 paddings
-#endif
-)
+ TENSOR3D_DECLARATION(output))
{
// Get pixels pointer
Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
@@ -84,26 +233,40 @@
VEC_DATA_TYPE(DATA_TYPE, 2)
data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 2);
+ data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+
// Perform calculations
data0 = POOL_OP(data0, data1);
DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
- // Divide by 4 in case of average pooling
-#ifdef POOL_AVG
- res *= calculate_avg_scale(2, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
-#endif
+#if defined(POOL_AVG) || defined(POOL_L2)
+ // Divide by pool region in case of average or l2 pooling
+ res = DIV_OP(res, calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
// Store result
*(__global DATA_TYPE *)output.ptr = res;
}
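+
+// A possible set of build options for this kernel (illustrative values only, not taken
+// from the library): average pooling of F32 data with stride 2 and no padding on a
+// 32x32 input plane could be compiled with
+//   -DDATA_TYPE=float -DPOOL_AVG -DMAX_WIDTH=32 -DMAX_HEIGHT=32
+//   -DSTRIDE_X=2 -DSTRIDE_Y=2 -DPAD_X=0 -DPAD_Y=0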
-/** Performs a pooling function of pool size equal to 3.
+/** Performs a pooling function of pool size equal to 3
*
- * @note Pooling stride must be passed using -DPOOL_STRIDE e.g -DPOOL_STRIDE=2. Supported strides are 1,2,3
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
- * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimensions
*
- * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
@@ -111,7 +274,7 @@
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
@@ -119,18 +282,10 @@
* @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad)
- * @param[in] strides The pooling operation strides in each dimension
- * @param[in] paddings The pooling operation paddings in each dimension
*/
__kernel void pooling_layer_3(
TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output)
-#ifdef POOL_AVG
- ,
- int2 max_dims, int2 strides, int2 paddings
-#endif
-)
+ TENSOR3D_DECLARATION(output))
{
// Get pixels pointer
Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
@@ -144,16 +299,306 @@
VEC_DATA_TYPE(DATA_TYPE, 3)
data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 3);
+ data1 = POW2_OP(data1, 3);
+ data2 = POW2_OP(data2, 3);
+#endif /* defined(POOL_L2) */
+
// Perform calculations
data0 = POOL_OP(data0, data1);
data0 = POOL_OP(data0, data2);
DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
- // Divide by 4 in case of average pooling
-#ifdef POOL_AVG
- res *= calculate_avg_scale(3, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
-#endif
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+ res = DIV_OP(res, calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
// Store result
*(__global DATA_TYPE *)output.ptr = res;
}
+
+#if defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
+
+#define CONVERT_OP(data_type) convert_##data_type##4
+#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
+
+VEC_DATA_TYPE(DATA_TYPE, 4)
+calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ const int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
+ const int start_y = get_global_id(1) * stride_y - pad_y;
+ const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w);
+ const int end_y = min(start_y + pool_size, upper_bound_h);
+ return (VEC_DATA_TYPE(DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
+}
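+
+// For example (illustrative values only): with stride_x = 1, pad_x = 0 and
+// pool_size = 3, the work-item at get_global_id(0) == 0 evaluates the four
+// neighbouring windows starting at start_x = (0, 1, 2, 3); each lane of the
+// returned vector holds 1 / (clipped window area) for its own output element.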
+
+/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_3_optimized(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ res;
+
+ // Perform pooling 3x3 for 4 output elements
+ POOLING3x3(res, input, output);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+ res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ vstore4(res, 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
+
+/** Performs a pooling function of pool size equal to 7.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_7(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data1 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data2 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data3 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data4 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data5 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 8);
+ data1 = POW2_OP(data1, 8);
+ data2 = POW2_OP(data2, 8);
+ data3 = POW2_OP(data3, 8);
+ data4 = POW2_OP(data4, 8);
+ data5 = POW2_OP(data5, 8);
+ data6 = POW2_OP(data6, 8);
+#endif /* defined(POOL_L2) */
+
+ // Pool operation of all rows
+ data0 = POOL_OP(data0, data1);
+ data2 = POOL_OP(data2, data3);
+ data4 = POOL_OP(data4, data5);
+ data0 = POOL_OP(data0, data2);
+ data4 = POOL_OP(data4, data6);
+ data0 = POOL_OP(data0, data4);
+
+ // Set last element
+#if defined(POOL_AVG) || defined(POOL_L2)
+ data0.s7 = 0;
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+ data0.s7 = data0.s6;
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
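+    // The rows are loaded 8 elements wide but the pool is only 7 wide, so lane s7
+    // must be neutral for the reduction below: 0 contributes nothing to a sum
+    // (average/L2 pooling), while a copy of s6 cannot change the result of a max.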
+
+ // Reduce result
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ reduce4 = POOL_OP(data0.s0123, data0.s4567);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+ DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+ res = DIV_OP(res, calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = res;
+}
+
+#if defined(POOL_SIZE)
+
+// Set the initial value for the pooling operation accordingly with the data type
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define INITIAL_VALUE 0
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#ifdef FIXED_POINT_POSITION
+#define MIN_VAL_EXPAND(type) type##_MIN
+#define MIN_VAL(type) MIN_VAL_EXPAND(type)
+#define INITIAL_VALUE MIN_VAL(DATA_TYPE)
+#else // FIXED_POINT_POSITION
+#if FP16
+#define INITIAL_VALUE -HALF_MAX
+#else // FP16
+#define INITIAL_VALUE -FLT_MAX
+#endif // FP16
+#endif // FIXED_POINT_POSITION
+
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note -DFP16 must be passed at compile time if half float data type is used
+ * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_N(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ vdata = INITIAL_VALUE;
+ DATA_TYPE sdata = INITIAL_VALUE;
+
+ // Load data
+ for(int y = 0; y < POOL_SIZE; y++)
+ {
+ int x = 0;
+ for(; x <= ((int)POOL_SIZE - 8); x += 8)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+ vdata = POOL_OP(vdata, data0);
+ }
+
+ // Leftover
+ for(; x < (int)POOL_SIZE; ++x)
+ {
+ DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+ sdata = POOL_OP(sdata, data0);
+ }
+ }
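+    // For example, with -DPOOL_SIZE=13 (an assumed value) each row is consumed as
+    // one vload8 plus five scalar leftover loads; the vector accumulator vdata and
+    // the scalar accumulator sdata are merged in the reduction below.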
+
+ // Reduce result
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+ DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+ res = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+ res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = res;
+}
+#endif // defined(POOL_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/quantization_layer.cl
new file mode 100644
index 0000000..80ea540
--- /dev/null
+++ b/src/core/CL/cl_kernels/quantization_layer.cl
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This performs the quantization of floating point inputs to 8-bit unsigned integers.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] min_max_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
+ * @param[in] min_max_stride_x Stride of the min/max vector in X dimension (in bytes)
+ * @param[in] min_max_step_x min_max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] min_max_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ */
+__kernel void quantization_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ VECTOR_DECLARATION(min_max))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // min_max_value.s0 = min, min_max_value.s1 = max
+ const float2 min_max_value = vload2(0, (__global float *)(min_max_ptr + min_max_offset_first_element_in_bytes));
+
+ const float4 vmin = (float4)min_max_value.s0;
+ const float4 vrange = (float4)(min_max_value.s1 - min_max_value.s0);
+
+ // Load data
+ float4 data = vload4(0, (__global float *)input.ptr);
+
+ // Map float values to range [0.0, 1.0]
+ data = (data - vmin) / vrange;
+
+ // Quantize and saturate
+ uchar4 res = convert_uchar4_sat(data * 256.0f);
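+    // e.g. (assumed values) with min = -2.0f and max = 2.0f, an input of 1.0f maps
+    // to (1 - (-2)) / 4 = 0.75 and quantizes to convert_uchar4_sat(0.75f * 256.0f) = 192.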
+
+ // Store result
+ vstore4(res, 0, (__global uchar *)output.ptr);
+}
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
new file mode 100644
index 0000000..d46a226
--- /dev/null
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculate square sum of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return square sum of vector.
+ */
+inline DATA_TYPE square_sum(__global const DATA_TYPE *input)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input);
+
+ in *= in;
+
+ in.s01234567 += in.s89ABCDEF;
+ in.s0123 += in.s4567;
+ in.s01 += in.s23;
+
+ return (in.s0 + in.s1);
+}
+
+/** Calculate sum of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return sum of vector.
+ */
+inline DATA_TYPE sum(__global const DATA_TYPE *input)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input);
+
+ in.s01234567 += in.s89ABCDEF;
+ in.s0123 += in.s4567;
+ in.s01 += in.s23;
+
+ return (in.s0 + in.s1);
+}
+
+/** This kernel performs reduction given an operation.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] partial_sum_ptr                            The buffer to hold the partial sums. Supported data types: same as @p src_ptr
+ * @param[in] partial_sum_stride_x                       Stride of the partial sum tensor in X dimension (in bytes)
+ * @param[in] partial_sum_step_x                         partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] partial_sum_offset_first_element_in_bytes  The offset of the first element in the partial sum tensor
+ * @param[in] local_sums                                 Local buffer for storing the partial sums
+ */
+__kernel void reduction_operation(
+ VECTOR_DECLARATION(src),
+ VECTOR_DECLARATION(partial_sum),
+ __local DATA_TYPE *local_sums)
+{
+ Vector src = CONVERT_TO_VECTOR_STRUCT(src);
+ Vector partial_sum = CONVERT_TO_VECTOR_STRUCT(partial_sum);
+
+ unsigned int lsize = get_local_size(0);
+ unsigned int lid = get_local_id(0);
+
+ local_sums[lid] = OPERATION((__global DATA_TYPE *)src.ptr);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Perform parallel reduction
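+    // e.g. with a work-group size of 8 (an assumed value) the loop strides are
+    // 4, 2, 1: lanes 0-3 add lanes 4-7, lanes 0-1 add lanes 2-3, and finally
+    // lane 0 adds lane 1, leaving the group's partial result in local_sums[0].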
+ for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
+ {
+ if(lid < i)
+ {
+ local_sums[lid] += local_sums[lid + i];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ if(lid == 0)
+ {
+ ((__global DATA_TYPE *)partial_sum.ptr + get_group_id(0))[0] = local_sums[0];
+ }
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/reshape_layer.cl b/src/core/CL/cl_kernels/reshape_layer.cl
new file mode 100644
index 0000000..23eccbf
--- /dev/null
+++ b/src/core/CL/cl_kernels/reshape_layer.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform tensor reshape
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_shape Input spatial shape
+ * @param[in] output_shape Output spatial shape
+ */
+__kernel void reshape_layer(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ int2 input_shape,
+ int2 output_shape)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+ int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
+
+ // Linearize index
+ int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y;
+
+ // Translate to output
+ int3 out_id;
+ out_id.x = linear_idx % output_shape.x;
+ out_id.y = (linear_idx / output_shape.x) % output_shape.y;
+ out_id.z = linear_idx / (output_shape.x * output_shape.y);
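+    // e.g. (assumed shapes) with input_shape = (4, 2) and output_shape = (2, 4),
+    // the element at id = (3, 1, 0) has linear_idx = 3 + 1 * 4 = 7 and is written
+    // to out_id = (7 % 2, (7 / 2) % 4, 7 / 8) = (1, 3, 0).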
+
+ // Store result
+ *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) = *((__global DATA_TYPE *)in.ptr);
+}
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/roi_pooling_layer.cl
new file mode 100644
index 0000000..042b102
--- /dev/null
+++ b/src/core/CL/cl_kernels/roi_pooling_layer.cl
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if DATA_SIZE == 32
+#define VEC_SIZE 4
+#define VEC_MAX vec4_max
+#elif DATA_SIZE == 16
+#define VEC_SIZE 8
+#define VEC_MAX vec8_max
+#else /* DATA_SIZE is neither 32 nor 16 */
+#error "Unsupported data size"
+#endif /* DATA_SIZE == 32 */
+
+inline DATA_TYPE vec4_max(VEC_DATA_TYPE(DATA_TYPE, 4) vec)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ temp = fmax(vec.lo, vec.hi);
+ return fmax(temp.x, temp.y);
+}
+
+inline DATA_TYPE vec8_max(VEC_DATA_TYPE(DATA_TYPE, 8) vec)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ temp = fmax(vec.lo, vec.hi);
+ return vec4_max(temp);
+}
+
+/** Performs a roi pooling on a single output pixel.
+ *
+ * @param[in] input Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] region_end_x End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] region_end_y End y index projected onto the input tensor.
+ * @param[in] pz z index of the input tensor.
+ *
+ * @return A max pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int region_end_x, int region_start_y, int region_end_y, int pz)
+{
+ // Iterate through the pooling region
+ if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ {
+ return (DATA_TYPE)0;
+ }
+ else
+ {
+ int num_iter = (int)((region_end_x - region_start_x) / VEC_SIZE);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ curr_max = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(-FLT_MAX);
+ for(int j = region_start_y; j < region_end_y; ++j)
+ {
+ int i = region_start_x;
+ for(; i < region_start_x + num_iter * VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(input, i, j, pz));
+ curr_max = fmax(val, curr_max);
+ }
+ for(; i < region_end_x; ++i)
+ {
+ DATA_TYPE val = *(__global DATA_TYPE *)tensor3D_offset(input, i, j, pz);
+ curr_max = fmax(curr_max, val);
+ }
+ }
+ return (DATA_TYPE)VEC_MAX(curr_max);
+ }
+}
+
+/** Performs a roi pooling function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the pooled region of the source image as specified by ROI
+ * @param[in] rois_ptr Pointer to the rois array. Layout: {x, y, width, height, batch_indx}
+ * @param[in] rois_stride_x Stride of the rois array in X dimension (in bytes)
+ * @param[in] rois_step_x rois_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the rois array
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_stride_w Stride of the source image in W dimension (in bytes)
+ * @param[in] output_stride_w Stride of the destination image in W dimension (in bytes)
+ */
+__kernel void roi_pooling_layer(
+ TENSOR3D_DECLARATION(input),
+ VECTOR_DECLARATION(rois),
+ TENSOR3D_DECLARATION(output),
+ unsigned int input_stride_w, unsigned int output_stride_w)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+ Vector rois = CONVERT_TO_VECTOR_STRUCT_NO_STEP(rois);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pw = get_global_id(2);
+
+ // Load roi parameters
+ // roi is laid out as follows:
+ // { x, y, width, height, batch_index }
+ const ushort4 roi = vload4(0, (__global ushort *)vector_offset(&rois, pw));
+ const ushort roi_batch = *((__global ushort *)vector_offset(&rois, pw) + 4);
+ const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
+ const int2 roi_dims = convert_int2_sat(fmax(round(convert_float2(roi.s23) * (float)SPATIAL_SCALE), 1.f));
+
+ // Calculate pooled region start and end
+ const float2 spatial_indx = (float2)(px, py);
+ const float2 pooled_dims = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+ const int2 max_spatial_dims = (int2)(MAX_DIM_X, MAX_DIM_Y);
+ int2 region_start = convert_int2_sat(floor(spatial_indx / pooled_dims * convert_float2(roi_dims))) + roi_anchor;
+ int2 region_end = convert_int2_sat(floor((spatial_indx + 1) / pooled_dims * convert_float2(roi_dims))) + roi_anchor;
+
+ region_start = clamp(region_start, 0, max_spatial_dims);
+ region_end = clamp(region_end, 0, max_spatial_dims);
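+    // e.g. (assumed values) with SPATIAL_SCALE = 0.0625 a ROI of
+    // {x = 32, y = 32, width = 64, height = 64} projects to roi_anchor = (2, 2) and
+    // roi_dims = (4, 4); with POOLED_DIM_X = POOLED_DIM_Y = 2 the output bin (0, 0)
+    // then covers input rows/columns 2 and 3.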
+
+ // Move input and output pointer across the fourth dimension
+ input.ptr += roi_batch * input_stride_w;
+ output.ptr += pw * output_stride_w;
+
+ for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+ {
+        *(__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz) = (DATA_TYPE)roi_pool_1x1(&input,
+ region_start.x,
+ region_end.x,
+ region_start.y,
+ region_end.y, pz);
+ }
+}
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index 9ef33b8..0106ce0 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -70,20 +70,20 @@
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
* @param[in] input_width Input image width
* @param[in] input_height Input image height
- * @param[in] output_width Output image width
- * @param[in] output_height Output image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_nearest_neighbour(
IMAGE_DECLARATION(in),
IMAGE_DECLARATION(out),
const float input_width,
const float input_height,
- const float output_width,
- const float output_height)
+ const float scale_x,
+ const float scale_y)
{
Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(input_width / output_width, input_height / output_height);
+ const float2 r = (float2)(scale_x, scale_y);
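+    // scale_x/scale_y replace the input/output size ratio that was previously
+    // computed in the kernel, e.g. (assumed sizes) a 64x64 input scaled to a
+    // 128x128 output corresponds to scale_x = scale_y = 0.5f.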
const float8 tc = clamp_to_border(transform_nearest(get_current_coords(), r), input_width, input_height);
vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
}
@@ -104,20 +104,20 @@
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
* @param[in] input_width Input image width
* @param[in] input_height Input image height
- * @param[in] output_width Output image width
- * @param[in] output_height Output image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_bilinear(
IMAGE_DECLARATION(in),
IMAGE_DECLARATION(out),
const float input_width,
const float input_height,
- const float output_width,
- const float output_height)
+ const float scale_x,
+ const float scale_y)
{
Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(input_width / output_width, input_height / output_height);
- const float8 tc = clamp_to_border(transform_bilinear(get_current_coords(), r), input_width, input_height);
+ const float2 r = (float2)(scale_x, scale_y);
+ const float8 tc = transform_bilinear(get_current_coords(), r);
vstore4(bilinear_interpolate(&in, tc, input_width, input_height), 0, (__global DATA_TYPE *)out.ptr);
}
diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl
index ef9878c..d9b5d07 100644
--- a/src/core/CL/cl_kernels/scharr_filter.cl
+++ b/src/core/CL/cl_kernels/scharr_filter.cl
@@ -52,28 +52,28 @@
#ifdef GRAD_X
,
IMAGE_DECLARATION(dst_gx)
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
,
IMAGE_DECLARATION(dst_gy)
-#endif
+#endif /* GRAD_Y */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
// Output pixels
#ifdef GRAD_X
short8 gx = (short8)0;
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
short8 gy = (short8)0;
-#endif
+#endif /* GRAD_Y */
// Row0
uchar16 temp = vload16(0, offset(&src, -1, -1));
@@ -83,12 +83,12 @@
#ifdef GRAD_X
gx += left * (short8)(-3);
gx += right * (short8)(+3);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
gy += left * (short8)(-3);
gy += middle * (short8)(-10);
gy += right * (short8)(-3);
-#endif
+#endif /* GRAD_Y */
// Row1
temp = vload16(0, offset(&src, -1, 0));
@@ -97,7 +97,7 @@
#ifdef GRAD_X
gx += left * (short8)(-10);
gx += right * (short8)(+10);
-#endif
+#endif /* GRAD_X */
// Row2
temp = vload16(0, offset(&src, -1, 1));
@@ -107,18 +107,18 @@
#ifdef GRAD_X
gx += left * (short8)(-3);
gx += right * (short8)(+3);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
gy += left * (short8)(+3);
gy += middle * (short8)(+10);
gy += right * (short8)(+3);
-#endif
+#endif /* GRAD_Y */
// Store results
#ifdef GRAD_X
vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl
index 4eb0eef..fc2b0ee 100644
--- a/src/core/CL/cl_kernels/sobel_filter.cl
+++ b/src/core/CL/cl_kernels/sobel_filter.cl
@@ -56,28 +56,28 @@
#ifdef GRAD_X
,
IMAGE_DECLARATION(dst_gx)
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
,
IMAGE_DECLARATION(dst_gy)
-#endif
+#endif /* GRAD_Y */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
// Output pixels
#ifdef GRAD_X
short8 gx = (short8)0;
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
short8 gy = (short8)0;
-#endif
+#endif /* GRAD_Y */
// Row0
uchar16 temp = vload16(0, offset(&src, -1, -1));
@@ -87,12 +87,12 @@
#ifdef GRAD_X
gx += left * (short8)(-1);
gx += right * (short8)(+1);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
gy += left * (short8)(-1);
gy += middle * (short8)(-2);
gy += right * (short8)(-1);
-#endif
+#endif /* GRAD_Y */
// Row1
temp = vload16(0, offset(&src, -1, 0));
@@ -101,7 +101,7 @@
#ifdef GRAD_X
gx += left * (short8)(-2);
gx += right * (short8)(+2);
-#endif
+#endif /* GRAD_X */
// Row2
temp = vload16(0, offset(&src, -1, 1));
@@ -111,20 +111,20 @@
#ifdef GRAD_X
gx += left * (short8)(-1);
gx += right * (short8)(+1);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
gy += left * (short8)(+1);
gy += middle * (short8)(+2);
gy += right * (short8)(+1);
-#endif
+#endif /* GRAD_Y */
// Store results
#ifdef GRAD_X
vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
/**********************************************/
@@ -261,20 +261,20 @@
#ifdef GRAD_X
,
IMAGE_DECLARATION(dst_gx)
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
,
IMAGE_DECLARATION(dst_gy)
-#endif
+#endif /* GRAD_Y */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
// Output pixels
short16 gx_gy = sobel1x5(&src,
@@ -284,10 +284,10 @@
// Store result in dst
#ifdef GRAD_X
vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
@@ -326,32 +326,32 @@
#ifdef GRAD_X
IMAGE_DECLARATION(src_x),
IMAGE_DECLARATION(dst_gx),
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
IMAGE_DECLARATION(src_y),
IMAGE_DECLARATION(dst_gy),
-#endif
+#endif /* GRAD_Y */
int dummy)
{
#ifdef GRAD_X
Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
#ifdef GRAD_X
short8 gx = sobel5x1(&src_x,
1, 4, 6, 4, 1);
vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
short8 gy = sobel5x1(&src_y,
-1, -2, 0, 2, 1);
vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
/**********************************************/
@@ -444,20 +444,20 @@
#ifdef GRAD_X
,
IMAGE_DECLARATION(dst_gx)
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
,
IMAGE_DECLARATION(dst_gy)
-#endif
+#endif /* GRAD_Y */
)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
#ifdef GRAD_X
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
int8 gx = (int8)0;
int8 gy = (int8)0;
@@ -466,10 +466,10 @@
// Store result in dst
#ifdef GRAD_X
vstore8(gx, 0, ((__global int *)dst_gx.ptr));
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
vstore8(gy, 0, ((__global int *)dst_gy.ptr));
-#endif
+#endif /* GRAD_Y */
}
/** Apply a 7x1 convolution matrix to two single channel S16 input temporary images and output two single channel S16 images and leave the borders undefined.
@@ -507,33 +507,33 @@
#ifdef GRAD_X
IMAGE_DECLARATION(src_x),
IMAGE_DECLARATION(dst_gx),
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
IMAGE_DECLARATION(src_y),
IMAGE_DECLARATION(dst_gy),
-#endif
+#endif /* GRAD_Y */
int dummy)
{
#ifdef GRAD_X
Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif
+#endif /* GRAD_Y */
// Output pixels
#ifdef GRAD_X
int8 gx = 0;
SOBEL7x1(&src_x, gx, Y);
vstore8(gx, 0, (__global int *)dst_gx.ptr);
-#endif
+#endif /* GRAD_X */
#ifdef GRAD_Y
int8 gy = 0;
SOBEL7x1(&src_y, gy, X);
vstore8(gy, 0, (__global int *)dst_gy.ptr);
-#endif
+#endif /* GRAD_Y */
}
/**********************************************/
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 632b4a5..9b24380 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -23,15 +23,37 @@
*/
#include "helpers.h"
-#if defined USE_F16
-#define MINVAL HALF_MIN
+#ifdef FIXED_POINT_POSITION
+
+#include "fixed_point.h"
+#define MAX_OP(x, y, type, size) MAX_OP_EXPAND(x, y, type, size)
+#define ADD_OP(x, y, type, size) ADD_SAT_OP_EXPAND((x), (y), type, size)
+#define SUB_OP(x, y, type, size) SUB_SAT_OP_EXPAND((x), (y), type, size)
+#define DIV_OP(x, y, type, size) DIV_SAT_OP_VEC_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
+#define EXP_OP(x, type, size) EXP_OP_EXPAND((x), type, size, FIXED_POINT_POSITION)
+
+#define MIN_VAL_EXPAND(type) type##_MIN
+#define MIN_VAL(type) MIN_VAL_EXPAND(type)
+#define MINVAL MIN_VAL(DATA_TYPE)
+#define SELECT_DATA_TYPE EXPAND(DATA_TYPE)
+
+#else /* FIXED_POINT_POSITION */
+
+#define MAX_OP(x, y, type, size) max((x), (y))
+#define ADD_OP(x, y, type, size) ((x) + (y))
+#define SUB_OP(x, y, type, size) ((x) - (y))
+#define DIV_OP(x, y, type, size) ((x) / (y))
+#define EXP_OP(x, type, size) exp((x))
+
+#ifdef USE_F16
+#define MINVAL -HALF_MAX
#define SELECT_DATA_TYPE short
-#define DATA_TYPE half
-#else
-#define MINVAL FLT_MIN
+#else /* USE_F16 */
+#define MINVAL -FLT_MAX
#define SELECT_DATA_TYPE int
-#define DATA_TYPE float
-#endif
+#endif /* USE_F16 */
+
+#endif /* FIXED_POINT_POSITION */
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -39,30 +61,34 @@
/** Identifies the maximum value across the 1st dimension.
*
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note In case F16 is used -DUSE_HALF must be passed otherwise the kernel will default to used F32.
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
* @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] width Input image width
*/
__kernel void softmax_layer_max(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst),
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
uint width)
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
// Initialize local maximum
VEC_DATA_TYPE(DATA_TYPE, 16)
@@ -74,23 +100,23 @@
{
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
- max_val = max(data, max_val);
+ max_val = MAX_OP(data, max_val, DATA_TYPE, 16);
}
-#if defined NON_MULTIPLE_OF_16
+#ifdef NON_MULTIPLE_OF_16
// Handle non multiple of 16
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
- max_val = max(max_val, select(type_min, data, widx));
-#endif
+ max_val = MAX_OP(max_val, select(type_min, data, widx), DATA_TYPE, 16);
+#endif /* NON_MULTIPLE_OF_16 */
// Perform max reduction
- max_val.s01234567 = max(max_val.s01234567, max_val.s89ABCDEF);
- max_val.s0123 = max(max_val.s0123, max_val.s4567);
- max_val.s01 = max(max_val.s01, max_val.s23);
- max_val.s0 = max(max_val.s0, max_val.s1);
+ max_val.s01234567 = MAX_OP(max_val.s01234567, max_val.s89ABCDEF, DATA_TYPE, 8);
+ max_val.s0123 = MAX_OP(max_val.s0123, max_val.s4567, DATA_TYPE, 4);
+ max_val.s01 = MAX_OP(max_val.s01, max_val.s23, DATA_TYPE, 2);
+ max_val.s0 = MAX_OP(max_val.s0, max_val.s1, DATA_TYPE, 1);
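+    // The reduction halves the vector at every step (16 -> 8 -> 4 -> 2 -> 1),
+    // so max_val.s0 now holds the maximum of all 16 lanes.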
// Store result
*((__global DATA_TYPE *)dst.ptr) = max_val.s0;
@@ -100,46 +126,54 @@
* then gets the exponent of each element as sums all elements across each row.
*
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note In case F16 is used -DUSE_HALF must be passed otherwise the kernel will default to used F32.
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
* @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: F16, F32
+ * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
* @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
* @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
* @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: F16, F32
+ * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
* @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
* @param[in] width Input image width
*/
__kernel void softmax_layer_shift_exp_sum(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(max),
- IMAGE_DECLARATION(dst),
- IMAGE_DECLARATION(sum),
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(max),
+ TENSOR3D_DECLARATION(dst),
+ TENSOR3D_DECLARATION(sum),
uint width)
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
- Image max = CONVERT_TO_IMAGE_STRUCT(max);
- Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
+ Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
// Load max value of 1D logits vector (row)
DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));
@@ -154,28 +188,30 @@
{
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
- data = exp(data - max_val);
+ data = SUB_OP(data, max_val, DATA_TYPE, 16);
+ data = EXP_OP(data, DATA_TYPE, 16);
vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, i << 4, 0));
- sum1D += data;
+ sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
}
-#if defined NON_MULTIPLE_OF_16
+#ifdef NON_MULTIPLE_OF_16
// Handle non multiple of 16
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
- data = exp(data - max_val);
+ data = SUB_OP(data, max_val, DATA_TYPE, 16);
+ data = EXP_OP(data, DATA_TYPE, 16);
VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
data = select(0, data, widx);
vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, width4 << 4, 0));
- sum1D += data;
-#endif
+ sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
+#endif /* NON_MULTIPLE_OF_16 */
 // Perform sum reduction
- sum1D.s01234567 = sum1D.s01234567 + sum1D.s89ABCDEF;
- sum1D.s0123 = sum1D.s0123 + sum1D.s4567;
- sum1D.s01 = sum1D.s01 + sum1D.s23;
- sum1D.s0 = sum1D.s0 + sum1D.s1;
+ sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
+ sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
+ sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
+ sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
// Calculate and store result
*((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
@@ -184,38 +220,45 @@
/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
*
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: F16, F32
+ * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
* @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
* @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
* @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void softmax_layer_norm(
- IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(sum),
- IMAGE_DECLARATION(dst))
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(sum),
+ TENSOR3D_DECLARATION(dst))
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
- Image sum = CONVERT_TO_IMAGE_STRUCT_NO_STEP(sum);
+ Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
// Load max value of 1D logits vector (row)
DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
- vstore16(data / sum_val, 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
+ vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
}
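
The three kernels above (softmax_layer_max, softmax_layer_shift_exp_sum and softmax_layer_norm) together implement a numerically stable softmax along each row. As a reference for what the vectorised code computes, here is a minimal scalar C++ sketch of the same three stages, assuming the plain float path (no fixed point, no vector masking); it is an illustration, not part of the patch:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for the three-stage softmax split used above:
// 1) row maximum, 2) exp(x - max) plus running sum, 3) divide by the sum.
// src must be non-empty.
std::vector<float> softmax_row(const std::vector<float> &src)
{
    // Stage 1: softmax_layer_max
    const float max_val = *std::max_element(src.begin(), src.end());

    // Stage 2: softmax_layer_shift_exp_sum
    std::vector<float> dst(src.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < src.size(); ++i)
    {
        dst[i] = std::exp(src[i] - max_val);
        sum += dst[i];
    }

    // Stage 3: softmax_layer_norm
    for(float &v : dst)
    {
        v /= sum;
    }
    return dst;
}
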
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl
index c30158f..c993005 100644
--- a/src/core/CL/cl_kernels/transpose.cl
+++ b/src/core/CL/cl_kernels/transpose.cl
@@ -98,7 +98,10 @@
#ifndef DATA_TYPE_IN_BYTES
#error DATA_TYPE_IN_BYTES not set for the transpose OpenCL kernel
-#endif
+#endif /* not DATA_TYPE_IN_BYTES */
+
+#undef VLOAD
+#undef VSTORE
#if DATA_TYPE_IN_BYTES == 4
#define DATA_TYPE uint
@@ -118,9 +121,9 @@
#define VLOAD(x, y) vload16(x, y)
#define VSTORE(x, y, z) vstore16(x, y, z)
#define BLOCK_SIZE 16
-#else
+#else /* switch DATA_TYPE_IN_BYTES */
#error DATA_TYPE_IN_BYTES not supported for transpose
-#endif
+#endif /* switch DATA_TYPE_IN_BYTES */
/** This OpenCL kernel computes the matrix transposition of input matrix
*
diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
index 863b6c9..d955e42 100644
--- a/src/core/CL/cl_kernels/warp_perspective.cl
+++ b/src/core/CL/cl_kernels/warp_perspective.cl
@@ -92,7 +92,7 @@
{
Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
Image out = CONVERT_TO_IMAGE_STRUCT(out);
- vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
+ vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
}
/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
@@ -124,5 +124,5 @@
{
Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
Image out = CONVERT_TO_IMAGE_STRUCT(out);
- vstore4(bilinear_interpolate(&in, clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), width, height), 0, out.ptr);
+ vstore4(bilinear_interpolate(&in, apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), 0, out.ptr);
}
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 83bbe6a..18202c1 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/TensorInfo.h"
@@ -33,32 +34,98 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+
using namespace arm_compute;
-void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+CLActivationLayerKernel::CLActivationLayerKernel()
+ : _input(nullptr), _output(nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+}
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ if(output != nullptr)
+ {
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const int fixed_point_position = input->info()->fixed_point_position();
+ float a_const = act_info.a();
+ float b_const = act_info.b();
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ a_const = static_cast<int>(lround(a_const * (1 << fixed_point_position)));
+ b_const = static_cast<int>(lround(b_const * (1 << fixed_point_position)));
+ }
// Set build options
std::set<std::string> build_opts;
- build_opts.insert(("-D" + string_from_activation_func(act_info.activation())));
- build_opts.insert(("-D" + ((is_data_type_float(input->info()->data_type())) ? std::string("TYPE_FP") : std::string("TYPE_INT"))));
- build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.insert(("-DA=" + val_to_string(act_info.a())));
- build_opts.insert(("-DB=" + val_to_string(act_info.b())));
+ build_opts.emplace(("-DACT=" + lower_string(string_from_activation_func(act_info.activation()))));
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const)));
+ build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const)));
+ build_opts.emplace(output == nullptr ? "-DIN_PLACE" : "");
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fixed_point_position)));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
// Make sure _kernel is initialized before calling the parent's configure
- constexpr unsigned int num_elems_processed_per_iteration = 16;
- ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration);
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+ }
+ else
+ {
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ }
+
+ ICLKernel::configure(win);
+}
+
+void CLActivationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ if(_output != nullptr)
+ {
+ add_3D_tensor_argument(idx, _output, slice);
+ }
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
}
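
The fixed-point branch in configure() above rescales the activation constants a and b into the QSn domain before passing them as -DA_VAL / -DB_VAL. A small stand-alone sketch of that conversion (the round-trip back to float is only for illustration and is not part of the kernel):

#include <cmath>

// Float -> QSn integer, as done for a_const/b_const above:
// multiply by 2^fixed_point_position and round to the nearest integer.
int to_fixed_point(float value, int fixed_point_position)
{
    return static_cast<int>(std::lround(value * (1 << fixed_point_position)));
}

// QSn integer -> float, shown only to make the scaling explicit.
float to_float(int fixed, int fixed_point_position)
{
    return static_cast<float>(fixed) / static_cast<float>(1 << fixed_point_position);
}

// Example: with FIXED_POINT_POSITION=4 a bound of a = 6.0f is encoded as
// 6.0 * 16 = 96 and decodes back to 96 / 16 = 6.0f.
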
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
index aaa62d0..65422c2 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -48,9 +48,37 @@
void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ else if(input1->info()->data_type() == DataType::F16 && input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
+ {
+ // Check that all data types are the same and all fixed-point positions are the same
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+ }
_input1 = input1;
_input2 = input2;
@@ -58,18 +86,16 @@
const bool has_float_out = is_data_type_float(output->info()->data_type());
- // Check for invalid combination
- if(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8))
- {
- ARM_COMPUTE_ERROR("You called with the wrong data types.");
- }
-
// Set kernel build options
std::set<std::string> build_opts;
build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ if(is_data_type_fixed_point(input1->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts));
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
index 4c84727..c5183af 100644
--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -45,17 +45,32 @@
void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- // Check for invalid combination
- if(output->info()->data_type() == DataType::U8)
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ // Auto initialize output if not initialized
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
}
- else
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ // Check that all data types are the same and all fixed-point positions are the same
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
}
_input1 = input1;
@@ -70,6 +85,10 @@
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ if(is_data_type_fixed_point(input1->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts));
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 309a153..18c0c97 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -26,12 +26,15 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
using namespace arm_compute;
CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
@@ -39,24 +42,10 @@
{
}
-void CLBatchNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
+void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
float epsilon)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
-
- // Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
_input = input;
_output = output;
@@ -66,25 +55,56 @@
_gamma = gamma;
_epsilon = epsilon;
+ if(output != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.emplace(output == nullptr ? "-DIN_PLACE" : "");
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
+
// Create kernel
- std::string kernel_name = "batchnormalization_layer";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
// Set kernel static arguments
unsigned int idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_float>(idx++, _epsilon);
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 4;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
-
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+ }
+ else
+ {
+ update_window_and_padding(win, input_access);
+ }
ICLKernel::configure(win);
}
@@ -108,7 +128,10 @@
{
idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
+ if(_output != nullptr)
+ {
+ add_3D_tensor_argument(idx, _output, slice);
+ }
enqueue(queue, *this, slice);
}
while(window.slide_window_slice_3D(slice));
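
For reference, the per-element operation this kernel launches is the usual batch-normalization formula, with mean, var, beta and gamma indexed by the element's channel. A scalar C++ sketch (illustration only; the CL kernel vectorises it and also covers the QS8/QS16 path):

#include <cmath>

// out = gamma * (x - mean) / sqrt(var + epsilon) + beta
float batch_normalize(float x, float mean, float var, float beta, float gamma, float epsilon)
{
    const float inv_std = 1.0f / std::sqrt(var + epsilon);
    return gamma * (x - mean) * inv_std + beta;
}
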
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
index e113d30..0299f62 100644
--- a/src/core/CL/kernels/CLBox3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp
@@ -37,7 +37,7 @@
BorderSize CLBox3x3Kernel::border_size() const
{
- return 1;
+ return BorderSize(1);
}
void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
index 5411533..be046cf 100644
--- a/src/core/CL/kernels/CLChannelExtractKernel.cpp
+++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp
@@ -71,12 +71,12 @@
// Configure window
Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
- AccessWindowRectangle output_access(input->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
+ AccessWindowRectangle output_access(output->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
update_window_and_padding(win, input_access, output_access);
ValidRegion input_valid_region = input->info()->valid_region();
- output_access.set_valid_region(win, ValidRegion(std::move(input_valid_region.anchor), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
ICLKernel::configure(win);
}
@@ -115,11 +115,10 @@
// Configure window
Window win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(input_plane->info(), 0, _num_elems_processed_per_iteration);
+ AccessWindowHorizontal input_access(input_plane->info(), 0, _num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input_plane->info(), 0, _num_elems_processed_per_iteration),
- output_access);
+ update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, input_plane->info()->valid_region());
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index ad66c39..c7884e3 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -43,9 +43,20 @@
void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, convolved_dims.first);
+ output_shape.set(1, convolved_dims.second);
+ output_shape.set(2, input->info()->tensor_shape()[0]);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
@@ -53,16 +64,22 @@
// Create kernel
std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts));
+ build_opts.emplace("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
- _kernel.setArg<cl_uint>(idx++, _convolved_dims.first);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts));
// Configure window
Window win = calculate_max_window(*input->info(), Steps());
+
// The CLCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
ICLKernel::configure(win);
}
@@ -70,16 +87,23 @@
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+ // The collapse method relies on the assumption that the third dimension of the input buffer is 1
+ ARM_COMPUTE_ERROR_ON(window.z().end() != 1);
- Window slice_in = window.first_slice_window_2D();
- Window slice_out = window.first_slice_window_3D();
+ Window collapsed_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed_window.first_slice_window_3D();
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ _kernel.setArg<cl_uint>(idx++, _output->info()->strides_in_bytes()[3]);
+
do
{
// Set inputs
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in);
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out));
+ while(collapsed_window.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
index bdfe398..fd64dc4 100644
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -79,7 +79,7 @@
options.insert(mat_str.str());
}
- options.insert("-DSCALE=" + val_to_string(scale));
+ options.insert("-DSCALE=" + support::cpp11::to_string(scale));
DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size);
options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -143,7 +143,7 @@
for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
{
- build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
}
build_opts.insert("-DSCALE=0");
@@ -151,7 +151,7 @@
build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable1x" + val_to_string(matrix_size) + "_static", build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable1x" + support::cpp11::to_string(matrix_size) + "_static", build_opts));
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 8;
@@ -195,10 +195,10 @@
for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
{
- build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
}
- build_opts.insert("-DSCALE=" + val_to_string(scale));
+ build_opts.insert("-DSCALE=" + support::cpp11::to_string(scale));
build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
@@ -209,7 +209,7 @@
build_opts.insert(out_type.str());
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable" + val_to_string(matrix_size) + "x1_static", build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable" + support::cpp11::to_string(matrix_size) + "x1_static", build_opts));
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 8;
@@ -270,16 +270,16 @@
for(unsigned int j = 0; j < MAX_MATRIX_SIZE; j++)
{
- options.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ options.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j]));
}
- options.insert("-DSCALE=" + val_to_string(scale));
+ options.insert("-DSCALE=" + support::cpp11::to_string(scale));
DataType data_type = data_type_for_convolution_matrix(conv, matrix_size);
options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- options.insert("-DMATRIX_WIDTH=" + val_to_string(width));
- options.insert("-DMATRIX_HEIGHT=" + val_to_string(height));
+ options.insert("-DMATRIX_WIDTH=" + support::cpp11::to_string(width));
+ options.insert("-DMATRIX_HEIGHT=" + support::cpp11::to_string(height));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_rectangle", options));
diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
index 73f1ba1..edfbf82 100644
--- a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
@@ -35,10 +35,14 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <map>
+
using namespace arm_compute;
CLDepthConcatenateKernel::CLDepthConcatenateKernel()
- : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+ : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
{
}
@@ -49,49 +53,58 @@
void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ static std::map<int, std::pair<std::string, int>> configs_map =
+ {
+ { 1, { "uchar", 16 } },
+ { 2, { "ushort", 8 } },
+ { 4, { "uint", 4 } },
+ { 8, { "ulong", 2 } },
+ };
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+ ARM_COMPUTE_ERROR_ON(configs_map.find(input->info()->element_size()) == configs_map.end());
// The gaps between the two lowest dimensions of input and output need to be divisible by 2
// Otherwise it is not clear how the padding should be added onto the input tensor
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
- _input = input;
- _output = output;
+ _input = input;
+ _output = output;
+ _depth_offset = depth_offset;
+
+ // Add build options
+ auto config = configs_map.find(static_cast<int>(input->info()->element_size()));
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + config->second.first));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(config->second.second)));
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth"));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
// Configure kernel window
_left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
_top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
- const unsigned int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2] + _left_right * output->info()->strides_in_bytes()[0] + _top_bottom *
- output->info()->strides_in_bytes()[1];
-
- const unsigned int num_elems_processed_per_iteration = 4;
- const unsigned int num_elems_read_per_iteration = 4;
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size();
const unsigned int num_rows_read_per_iteration = 1;
// The window needs to be based on input as we copy all the depths of input
- Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
+ AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowRectangle(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration),
- output_access);
-
+ update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
- _kernel.setArg<unsigned int>(idx, offset_to_first_elements_in_bytes);
-
ICLKernel::configure(win);
}
@@ -100,14 +113,27 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window slice = window.first_slice_window_3D();
+
+ const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2];
+
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ const cl_int3 offsets =
+ {
+ {
+ static_cast<cl_int>(_left_right),
+ static_cast<cl_int>(_top_bottom),
+ static_cast<cl_int>(offset_to_first_elements_in_bytes),
+ }
+ };
+ _kernel.setArg<cl_int3>(idx, offsets);
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(window.slide_window_slice_3D(slice));
}
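
The configs_map introduced above picks an untyped OpenCL vector type plus a vector width so that every work item always moves 16 bytes, whatever the element size. A small sketch of that relation (function name is mine, illustration only):

#include <string>
#include <utility>

// Each work item of concatenate_depth moves 16 bytes, so the vector width
// is 16 / element_size and the element is reinterpreted as an unsigned
// integer of the same width. Mirrors the configs_map above.
std::pair<std::string, int> concat_vec_config(int element_size_in_bytes)
{
    switch(element_size_in_bytes)
    {
        case 1: return { "uchar", 16 };  // 16 x 1 byte  = 16 bytes
        case 2: return { "ushort", 8 };  //  8 x 2 bytes = 16 bytes
        case 4: return { "uint", 4 };    //  4 x 4 bytes = 16 bytes
        case 8: return { "ulong", 2 };   //  2 x 8 bytes = 16 bytes
        default: return { "", 0 };       // unsupported; caught by the error check in configure()
    }
}
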
diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertKernel.cpp
index 24608bd..c43884a 100644
--- a/src/core/CL/kernels/CLDepthConvertKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConvertKernel.cpp
@@ -40,13 +40,21 @@
void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
+ DataType::U16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
+ DataType::U16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different");
ARM_COMPUTE_ERROR_ON(shift >= 8);
 // Check if conversion is supported
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32,
+ "Only data types supported [in] QS8 -> [out] F32");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::F32),
+ "Only data types supported [in] QS16 -> [out] F32");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && ((output->info()->data_type() != DataType::QS8) && output->info()->data_type() != DataType::QS16),
+ "Only data types supported [in] F32 -> [out] QS8, QS16");
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16
&& output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32),
"Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
@@ -67,6 +75,11 @@
&& output->info()->data_type() != DataType::S16),
"Only data types supported [in] S32 -> [out] U8, U16, S16");
+ // Auto initialize output shape if not initialized (we can only auto-configure the shape, the data type must be given)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
// Get data sizes
const size_t input_size = data_size_from_type(input->info()->data_type());
const size_t output_size = data_size_from_type(output->info()->data_type());
@@ -83,8 +96,12 @@
{
kernel_name += "_up";
}
- build_opts.insert("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ if(is_data_type_fixed_point(input->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
diff --git a/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
new file mode 100644
index 0000000..6e56835
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+CLDepthwiseConvolution3x3Kernel::CLDepthwiseConvolution3x3Kernel()
+ : _border_size(0), _input(), _output(), _weights(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0)
+{
+}
+
+BorderSize CLDepthwiseConvolution3x3Kernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLDepthwiseConvolution3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
+
+ std::pair<unsigned int, unsigned int> expected_output = scaled_dimensions(input->info()->tensor_shape().x(), input->info()->tensor_shape().y(),
+ weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(),
+ conv_info);
+
+ ARM_COMPUTE_UNUSED(expected_output);
+ ARM_COMPUTE_ERROR_ON(expected_output.first != output->info()->tensor_shape().x());
+ ARM_COMPUTE_ERROR_ON(expected_output.second != output->info()->tensor_shape().y());
+
+ _input = input;
+ _output = output;
+ _weights = weights;
+ _conv_stride_x = conv_info.stride().first;
+ _conv_stride_y = conv_info.stride().second;
+ _conv_pad_x = conv_info.pad().first;
+ _conv_pad_y = conv_info.pad().second;
+ _border_size = BorderSize(_conv_pad_y, _conv_pad_x);
+
+ // Set build options
+ ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
+ std::set<std::string> options{ "-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x) };
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_convolution_3x3", options));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 2;
+ const unsigned int num_elems_written_per_iteration = 2;
+ const unsigned int num_elems_read_per_iteration = 3 + _conv_stride_x;
+ const unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration, _conv_stride_x, _conv_stride_y);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
+
+ update_window_and_padding(win, input_access, weights_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthwiseConvolution3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_3D();
+ Window slice_weights = window.first_slice_window_3D();
+
+ slice_in.adjust(Window::DimX, -_conv_pad_x, true);
+ slice_in.adjust(Window::DimY, -_conv_pad_y, true);
+ slice_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+ slice_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+ slice_weights.set_dimension_step(Window::DimX, 0);
+ slice_weights.set_dimension_step(Window::DimY, 0);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ add_3D_tensor_argument(idx, _weights, slice_weights);
+
+ enqueue(queue, *this, slice_out);
+ }
+ while(window.slide_window_slice_3D(slice_out));
+}
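
configure() above validates the output against scaled_dimensions(). Assuming the usual floor-based convolution arithmetic (the exact rounding inside scaled_dimensions may differ), the expected spatial size works out as in this sketch:

#include <utility>

// Expected spatial output size of a convolution with explicit padding and
// stride, assuming floor rounding.
std::pair<unsigned int, unsigned int> expected_conv_output(unsigned int in_w, unsigned int in_h,
                                                           unsigned int k_w, unsigned int k_h,
                                                           unsigned int pad_x, unsigned int pad_y,
                                                           unsigned int stride_x, unsigned int stride_y)
{
    const unsigned int out_w = (in_w + 2 * pad_x - k_w) / stride_x + 1;
    const unsigned int out_h = (in_h + 2 * pad_y - k_h) / stride_y + 1;
    return { out_w, out_h };
}

// Example: a 224x224 input with a 3x3 kernel, pad 1, stride 1 gives 224x224;
// the same input with stride 2 gives 112x112.
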
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
new file mode 100644
index 0000000..0eaadb8
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+#include <tuple>
+
+using namespace arm_compute;
+
+CLDepthwiseIm2ColKernel::CLDepthwiseIm2ColKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height));
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
+ build_opts.emplace("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
+ build_opts.emplace("-DPAD_X=" + support::cpp11::to_string(conv_info.pad().first));
+ build_opts.emplace("-DPAD_Y=" + support::cpp11::to_string(conv_info.pad().second));
+ build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
+ build_opts.emplace("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_im2col", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthwiseIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_in = window.first_slice_window_3D();
+
+ // Setup slice
+ slice.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _output->info()->dimension(0)));
+ slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
+ slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 1));
+
+ // Setup input slice
+ // The first three dimensions of the input are increased by the inner loops
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
+}
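A minimal stand-alone sketch of driving the new im2col kernel follows; the tensor shapes, the CLScheduler set-up and the 3x3/stride-1 configuration are illustrative assumptions rather than part of this patch.

// Hedged sketch: exercise CLDepthwiseIm2ColKernel directly (shapes are made up for illustration).
#include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void run_depthwise_im2col_example()
{
    CLScheduler::get().default_init();

    const Size2D        kernel_dims(3, 3);
    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1

    CLTensor input, output;
    // Input: 16x16 image with 8 channels.
    input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    // Output: kernel_w * kernel_h elements per convolution position, per channel (9 = 3 * 3).
    output.allocator()->init(TensorInfo(TensorShape(9U, 256U, 8U), 1, DataType::F32));

    CLDepthwiseIm2ColKernel kernel;
    kernel.configure(&input, &output, kernel_dims, conv_info);

    input.allocator()->allocate();
    output.allocator()->allocate();

    CLScheduler::get().enqueue(kernel);
    CLScheduler::get().sync();
}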
diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
new file mode 100644
index 0000000..2086b1d
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseVectorToTensorKernel::CLDepthwiseVectorToTensorKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLDepthwiseVectorToTensorKernel::configure(const ICLTensor *input, ICLTensor *output, size_t conv_w, size_t conv_h)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DCONV_WIDTH=" + support::cpp11::to_string(conv_w));
+ build_opts.emplace("-DCONV_HEIGHT=" + support::cpp11::to_string(conv_h));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_vector_to_tensor", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+    // The CLDepthwiseVectorToTensorKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthwiseVectorToTensorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_1D();
+ Window slice_out = window.first_slice_window_3D();
+
+ // Setup slice
+ slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), 1));
+
+ // Setup output slice
+ // The first three dimensions of the output are increased by the inner loops
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_1D(slice) && window.slide_window_slice_3D(slice_out));
+}
diff --git a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
new file mode 100644
index 0000000..68de68b
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseWeightsReshapeKernel::CLDepthwiseWeightsReshapeKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * input->info()->dimension(1));
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_weights_reshape", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLDepthwiseWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthwiseWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_2D();
+
+ // Setup slice
+ slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
+ slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), 1));
+
+ // Setup output slice
+ // The first two dimensions of the output are increased by the inner loops
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && window.slide_window_slice_2D(slice_out));
+}
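The asserts in the three depthwise kernels above fix the intermediate shapes the depthwise GEMV path relies on; the helper below is an illustration of that implied shape arithmetic (the floor-rounded output size is an assumption), not code from this patch.

// Illustrative shape arithmetic implied by the asserts in the three depthwise kernels above.
#include <cstddef>

struct DepthwiseShapes
{
    // im2col output: [kernel_w * kernel_h, conv_w * conv_h, channels]
    std::size_t im2col_w, im2col_h, im2col_d;
    // reshaped weights: [kernel_w * kernel_h, channels]
    std::size_t weights_w, weights_h;
};

DepthwiseShapes depthwise_shapes(std::size_t in_w, std::size_t in_h, std::size_t channels,
                                 std::size_t kernel_w, std::size_t kernel_h,
                                 std::size_t stride_x, std::size_t stride_y,
                                 std::size_t pad_x, std::size_t pad_y)
{
    // Assumed floor-rounded convolved size, in the style of scaled_dimensions().
    const std::size_t conv_w = (in_w + 2 * pad_x - kernel_w) / stride_x + 1;
    const std::size_t conv_h = (in_h + 2 * pad_y - kernel_h) / stride_y + 1;

    return { kernel_w * kernel_h, conv_w * conv_h, channels,
             kernel_w * kernel_h, channels };
}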
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
new file mode 100644
index 0000000..216fa27
--- /dev/null
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLDequantizationLayerKernel::CLDequantizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _min_max(nullptr)
+{
+}
+
+void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output, min_max);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32, 0);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+ _min_max = min_max;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dequantization_layer"));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+
+ // Update window and padding
+ update_window_and_padding(win, input_access, output_access, min_max_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
+ Window slice = window_collapsed.first_slice_window_3D();
+
+ Window min_max_window = window;
+ min_max_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+ min_max_window.set(Window::DimY, Window::Dimension(0, _min_max->info()->dimension(1), 1));
+ min_max_window.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window min_max_slice = min_max_window.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ add_1D_tensor_argument(idx, _min_max, min_max_slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && min_max_window.slide_window_slice_1D(min_max_slice));
+}
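The min/max tensor is read two values wide per batch (see the AccessWindowStatic above); assuming the usual affine min/max scheme, the dequantisation the kernel is expected to perform boils down to the following sketch.

// Hedged reference of the assumed affine min/max dequantisation:
// real_value = min + quantised_value * (max - min) / 255
#include <cstdint>

float dequantize_sketch(uint8_t q, float min, float max)
{
    return min + static_cast<float>(q) * (max - min) / 255.0f;
}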
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
new file mode 100644
index 0000000..4224d9b
--- /dev/null
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
+ : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_pad_x(0), _conv_pad_y(0), _conv_stride_x(0), _conv_stride_y(0)
+{
+}
+
+BorderSize CLDirectConvolutionLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) != weights->info()->dimension(1),
+ "Weights should have same width as length");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) != 1 && weights->info()->dimension(0) != 3 && weights->info()->dimension(0) != 5,
+ "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported");
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON_MSG((weights->info()->dimension(0) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
+    ARM_COMPUTE_ERROR_ON_MSG((weights->info()->dimension(0) == 3 || weights->info()->dimension(0) == 5) && std::get<0>(conv_info.stride()) > 2, "Strides larger than 2 not supported for 3x3 and 5x5 convolutions.");
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const unsigned int kernel_size = weights->info()->dimension(0);
+
+ // Get convolved dimensions
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+ std::tie(output_width, output_height) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, output_width);
+ output_shape.set(1, output_height);
+ output_shape.set(2, weights->info()->dimension(3));
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ _conv_stride_x = std::get<0>(conv_info.stride());
+ _conv_stride_y = std::get<1>(conv_info.stride());
+ _conv_pad_x = std::min(std::get<0>(conv_info.pad()), kernel_size / 2);
+ _conv_pad_y = std::min(std::get<1>(conv_info.pad()), kernel_size / 2);
+
+ _input = input;
+ _weights = weights;
+ _output = output;
+ _biases = biases;
+ _border_size = BorderSize(_conv_pad_y, _conv_pad_x);
+
+ std::set<std::string> options;
+
+ const GPUTarget gpu_target = get_arch_from_target(get_target());
+
+ if(_biases != nullptr)
+ {
+ options.emplace("-DHAS_BIAS");
+ }
+
+ if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (input->info()->data_type() == DataType::F32))
+ {
+ options.emplace("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2)));
+
+ std::string kernel_name = "direct_convolution" + support::cpp11::to_string(kernel_size) + "x" + support::cpp11::to_string(kernel_size) + "_f32_bifrost";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info());
+
+ unsigned int num_elems_read_per_iteration_x = 0;
+ unsigned int num_elems_read_per_iteration_y = 0;
+ unsigned int num_elems_written_per_iteration_x = 0;
+ unsigned int num_elems_written_per_iteration_y = 0;
+
+ switch(kernel_size)
+ {
+ case 1:
+ {
+ num_elems_read_per_iteration_x = 4;
+ num_elems_read_per_iteration_y = 4;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_written_per_iteration_y = 4;
+ break;
+ }
+ case 3:
+ {
+ num_elems_read_per_iteration_x = 6;
+ num_elems_read_per_iteration_y = 5;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_written_per_iteration_y = 3;
+ break;
+ }
+ case 5:
+ {
+ num_elems_read_per_iteration_x = 8;
+ num_elems_read_per_iteration_y = 6;
+ num_elems_written_per_iteration_x = 4;
+ num_elems_written_per_iteration_y = 2;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
+ }
+ }
+
+ // Calculate right and bottom border
+ const int input_width = input->info()->dimension(0) - kernel_size / 2 + _conv_pad_x;
+ const int input_height = input->info()->dimension(1) - kernel_size / 2 + _conv_pad_y;
+
+ // Create window and update padding
+ win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
+ AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + num_elems_read_per_iteration_x, input_height + num_elems_read_per_iteration_y);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, kernel_size, kernel_size);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+
+ update_window_and_padding(win, input_access, weights_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+ }
+ else
+ {
+ std::stringstream kernel_name;
+ kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
+ DataType promoted_type = input->info()->data_type();
+
+ options.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ options.emplace("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
+ options.emplace("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2)));
+ options.emplace("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
+
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ options.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ promoted_type = DataType::QS16;
+ break;
+ case DataType::QS16:
+ promoted_type = DataType::QS32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ }
+ }
+
+ options.emplace("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), options));
+
+ // Configure kernel window
+
+ bool is_stride2 = ((kernel_size != 1) && (_conv_stride_x == 2));
+
+ const unsigned int num_elems_read_per_iteration_x = 8 + 2 * (kernel_size / 2) + (is_stride2 ? 6 + kernel_size / 2 : 0);
+ const unsigned int num_elems_read_per_iteration_y = kernel_size;
+ const unsigned int num_elems_written_per_iteration_x = 8;
+ const unsigned int num_elems_written_per_iteration_y = 1;
+
+ // Calculate right and bottom border
+ const int input_width = input->info()->dimension(0) - kernel_size / 2 + _conv_pad_x;
+ const int input_height = input->info()->dimension(1) - kernel_size / 2 + _conv_pad_y;
+
+ // Create window and update padding
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
+ AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + num_elems_read_per_iteration_x, input_height + num_elems_read_per_iteration_y);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, kernel_size, kernel_size);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+
+ update_window_and_padding(win, input_access, weights_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+ }
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "direct_convolution_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(kernel_size);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_pad_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_pad_y);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_stride_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_stride_y);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Get initial windows
+ Window slice = window.first_slice_window_3D();
+ Window win_in = window;
+
+ win_in.adjust(Window::DimX, -_conv_pad_x, true);
+ win_in.adjust(Window::DimY, -_conv_pad_y, true);
+ win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+ win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+
+ Window slice_in = win_in.first_slice_window_3D();
+
+ unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+ add_3D_tensor_argument(idx1, _weights, slice);
+
+ if(_biases != nullptr)
+ {
+ Window slice_biases;
+ slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+ add_1D_tensor_argument(idx1, _biases, slice_biases);
+ }
+
+ _kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
+}
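The output width/height passed to auto_init_if_empty() come from scaled_dimensions(); assuming the default floor rounding of PadStrideInfo, the arithmetic reduces to the sketch below.

// Hedged sketch of the output-size arithmetic behind scaled_dimensions() as used above,
// assuming the default floor rounding of PadStrideInfo.
unsigned int convolved_dim(unsigned int in, unsigned int kernel, unsigned int stride, unsigned int pad)
{
    return (in + 2 * pad - kernel) / stride + 1;
}
// e.g. 224x224 input, 3x3 kernel, stride 2, pad 1: convolved_dim(224, 3, 2, 1) == 112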
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 981aad6..2e066c7 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
@@ -76,7 +77,7 @@
// Define select type required by replicate border > 1
const DataType dt = tensor->info()->data_type();
- std::string select_type = get_cl_type_from_data_type(dt);
+ std::string select_type = get_underlying_cl_type_from_data_type(dt);
if(is_data_type_float(dt))
{
select_type = (DataType::F32 == dt) ? "int" : "short";
@@ -84,12 +85,16 @@
// Define build options
std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+ build_opts.emplace(("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt)));
build_opts.emplace(("-DSELECT_TYPE=" + select_type));
- build_opts.emplace(("-DBORDER_SIZE_TOP=" + val_to_string(border_size.top)));
- build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + val_to_string(border_size.bottom)));
- build_opts.emplace(("-DBORDER_SIZE_LEFT=" + val_to_string(border_size.left)));
- build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + val_to_string(border_size.right)));
+ build_opts.emplace(("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top)));
+ build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom)));
+ build_opts.emplace(("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left)));
+ build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right)));
+ if(is_data_type_fixed_point(tensor->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION");
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
@@ -108,7 +113,7 @@
const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
// Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor(); //Skip the tensor parameters
+ unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters
ICLKernel::add_argument<cl_uint>(idx, valid_width);
ICLKernel::add_argument<cl_uint>(idx, valid_height);
ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
@@ -119,9 +124,14 @@
case DataType::U8:
set_constant_border<uint8_t>(idx, constant_border_value);
break;
+ case DataType::QS8:
+ case DataType::S8:
+ set_constant_border<int8_t>(idx, constant_border_value);
+ break;
case DataType::U16:
set_constant_border<uint16_t>(idx, constant_border_value);
break;
+ case DataType::QS16:
case DataType::S16:
set_constant_border<int16_t>(idx, constant_border_value);
break;
@@ -148,7 +158,7 @@
Window win;
win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
win.set(Window::DimY, Window::Dimension(0, 1, 1));
- win.use_tensor_dimensions(tensor->info(), Window::DimZ);
+ win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ);
ICLKernel::configure(win);
}
@@ -163,13 +173,13 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window slice = window.first_slice_window_3D();
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _tensor, slice);
+ add_3D_tensor_argument(idx, _tensor, slice);
enqueue(queue, *this, slice, cl::NullRange);
}
- while(window.slide_window_slice_2D(slice));
+ while(window.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
new file mode 100644
index 0000000..6c9f83f
--- /dev/null
+++ b/src/core/CL/kernels/CLFloorKernel.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLFloorKernel::CLFloorKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLFloorKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_data_type_if_unknown(*input->info(), DataType::F32);
+ set_data_type_if_unknown(*output->info(), DataType::F32);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("floor_layer", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLFloorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 71d42c5..268260b 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -43,18 +43,27 @@
void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, input->info()->dimension(0) * 4);
+ output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(1)) / 4.0f));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
// Create kernel
std::string data_type_name;
- data_type_name = val_to_string(input->info()->element_size() * 8) + "bit";
+ data_type_name = support::cpp11::to_string(input->info()->element_size() * 8) + "bit";
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4_" + data_type_name));
// Configure kernel window
@@ -72,6 +81,14 @@
output_access.set_valid_region(win, input->info()->valid_region());
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "interleave4x4_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
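The auto-initialised output shape replaces the two removed dimension asserts; a quick numerical check of the [w * 4, ceil(h / 4)] shape it computes:

// Quick check of the interleaved output shape computed above.
#include <cmath>
#include <cstddef>
#include <utility>

std::pair<std::size_t, std::size_t> interleaved4x4_shape(std::size_t w, std::size_t h)
{
    return { w * 4, static_cast<std::size_t>(std::ceil(h / 4.0f)) };
}
// e.g. a 13x6 matrix becomes 52x2 after interleaving.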
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index c6e05b9..ef572cf 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -33,6 +33,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
#include <cstdint>
@@ -63,8 +64,8 @@
_output = output;
// Create kernel and set static arguments
- std::set<std::string> build_opts = { ("-DWIDTH_MATRIX_B=" + val_to_string(input1->info()->dimension(0))) };
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_u8", build_opts));
+ std::set<std::string> build_opts = { ("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_u8", build_opts));
unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
_kernel.setArg<int32_t>(idx++, a_offset);
_kernel.setArg<int32_t>(idx++, b_offset);
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index 289873c..263cfab 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -43,24 +43,30 @@
void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum);
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
_biases = biases;
_accum = accum;
+ std::set<std::string> build_opts;
+ build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type())));
+ if(is_data_type_fixed_point(accum->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(accum->info()->fixed_point_position()));
+ }
+
// Create kernel
- std::string data_type_name = lower_string(string_from_data_type(accum->info()->data_type()));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + data_type_name));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts));
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(accum->info()->data_type());
+ const unsigned int num_elems_processed_per_iteration = 16;
Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
+ AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1));
AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, biases_access, accum_access);
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 343838f..1499df0 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -40,10 +41,9 @@
{
}
-void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, const float beta)
+void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
@@ -53,7 +53,19 @@
const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
std::ostringstream ma_arguments;
- ma_arguments << "-DBETA=" << beta;
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ ma_arguments << "-DBETA=" << (input->info()->data_type() == DataType::QS8 ?
+ sqcvt_qs8_f32(beta, input->info()->fixed_point_position()) :
+ sqcvt_qs16_f32(beta, input->info()->fixed_point_position()))
+ << " ";
+ ma_arguments << "-DFIXED_POINT_POSITION=" << input->info()->fixed_point_position();
+ }
+ else
+ {
+ ma_arguments << "-DBETA=" << beta;
+ }
+
std::set<std::string> build_opts;
build_opts.emplace(ma_arguments.str());
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index d7388e8..b184c50 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -25,12 +25,12 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/AccessWindowTranspose.h"
-
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
@@ -48,13 +48,13 @@
{
}
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha)
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- if(output->info()->dimension(1) == 1)
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ if(!is_interleaved_transposed)
{
ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
}
@@ -72,51 +72,36 @@
_lws_hint = cl::NDRange(8, 8);
}
- std::ostringstream mm_arguments;
- mm_arguments << "-DWIDTH_MATRIX_B=" << input1->info()->dimension(0) << " ";
- mm_arguments << "-DALPHA=" << alpha << " ";
std::set<std::string> build_opts;
+ build_opts.emplace(("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))));
+ build_opts.emplace(("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))));
- // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication
- if(output->info()->dimension(1) == 1)
+ if(is_data_type_fixed_point(input0->info()->data_type()))
{
- mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
- build_opts.emplace(mm_arguments.str());
+ build_opts.emplace(("-DALPHA=" + support::cpp11::to_string((input0->info()->data_type() == DataType::QS8 ?
+ sqcvt_qs8_f32(alpha, input0->info()->fixed_point_position()) :
+ sqcvt_qs16_f32(alpha, input0->info()->fixed_point_position())))));
- // Create kernel
- std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_vm_" + data_type_name), build_opts));
-
- // Configure window kernel
- const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
-
- AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
- AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input0->info()->fixed_point_position())));
}
else
{
- build_opts.emplace(mm_arguments.str());
+ build_opts.emplace(("-DALPHA=" + float_to_string_with_full_precision(alpha)));
+ }
+ if(is_interleaved_transposed)
+ {
// Create kernel
std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
if(data_type_name == "f32")
{
GPUTarget arch_target = get_arch_from_target(get_target());
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target), build_opts));
}
else
{
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_" + data_type_name, build_opts));
}
// Configure window kernel
@@ -135,6 +120,55 @@
ICLKernel::configure(win);
}
+ else // The input tensors have not been reshaped
+ {
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+ // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+ const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())));
+ build_opts.emplace(("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x)));
+ build_opts.emplace(("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y)));
+
+ // Create kernel
+ if(is_data_type_fixed_point(input0->info()->data_type()))
+ {
+ std::string kernel_name = "gemm_mm_" + lower_string(string_from_data_type(input0->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel((kernel_name), build_opts));
+ }
+ else
+ {
+ std::string kernel_name = "gemm_mm_floating_point";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel((kernel_name), build_opts));
+ }
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+ AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "gemm_";
+ _config_id += (is_interleaved_transposed ? "reshaped_" : "");
+ _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
+ }
}
void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -144,9 +178,9 @@
Window slice = window.first_slice_window_2D();
Window slice_matrix_b = slice;
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
- slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
do
{
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
new file mode 100644
index 0000000..70af5d6
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+using namespace arm_compute;
+
+CLGEMMMatrixVectorMultiplyKernel::CLGEMMMatrixVectorMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _num_rows_read_per_iteration(0), _border_size(0)
+{
+}
+BorderSize CLGEMMMatrixVectorMultiplyKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLGEMMMatrixVectorMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
+ build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input0->info()->dimension(0)));
+ build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input0->info()->dimension(1)));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mv", build_opts));
+
+ // Configure kernel window
+ const unsigned int num_elems_read_per_iteration = 4;
+
+ _num_rows_read_per_iteration = 4;
+
+ const unsigned int border_x = ceil_to_multiple(input0->info()->dimension(0), num_elems_read_per_iteration) - input0->info()->dimension(0);
+ const unsigned int border_y = ceil_to_multiple(input0->info()->dimension(1), _num_rows_read_per_iteration) - input0->info()->dimension(1);
+
+ _border_size = BorderSize(border_y, border_x);
+
+ Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration, _num_rows_read_per_iteration);
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_read_per_iteration);
+ AccessWindowStatic output_access(_output->info(), 0, 0, _output->info()->dimension(0) + border_x, _output->info()->dimension(1) + border_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMMatrixVectorMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_3D();
+ Window slice_in2 = window.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_3D();
+
+ // Setup input0 slice
+ slice_in.set(Window::DimX, Window::Dimension(0, _input0->info()->dimension(0), _input0->info()->dimension(0)));
+ slice_in.set(Window::DimY, Window::Dimension(0, _input0->info()->dimension(1) + border_size().bottom, _num_rows_read_per_iteration));
+ slice_in.set(Window::DimZ, Window::Dimension(0, _input0->info()->dimension(2), 1));
+
+ // Setup input1 and output slice. Their dimensions are increased in the cl kernel.
+ slice_in2.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in2.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in2.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ unsigned int idx_1 = num_arguments_per_3D_tensor();
+
+ add_2D_tensor_argument(idx_1, _input1, slice_in2);
+
+ do
+ {
+ unsigned int idx_0 = 0;
+ unsigned int idx_2 = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+ add_3D_tensor_argument(idx_0, _input0, slice_in);
+ add_1D_tensor_argument(idx_2, _output, slice_out);
+ enqueue(queue, *this, slice_in);
+ }
+ while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+}
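
Note (editorial): the right/bottom border computed in configure() above is just the distance from each tensor extent to the next multiple of the read width. A minimal standalone C++ sketch of that arithmetic (illustrative values; ceil_to_multiple restated here, not part of the patch):

    #include <cstdio>

    // Restatement of ceil_to_multiple(): round value up to the next multiple of step
    static unsigned int ceil_to_multiple(unsigned int value, unsigned int step)
    {
        return ((value + step - 1) / step) * step;
    }

    int main()
    {
        const unsigned int width = 10, height = 7;                 // illustrative input0 dimensions
        const unsigned int elems_per_read = 4, rows_per_read = 4;  // as in configure() above

        // Right/bottom border so that 4-wide / 4-row reads never fall outside the tensor
        const unsigned int border_x = ceil_to_multiple(width, elems_per_read) - width;   // 2
        const unsigned int border_y = ceil_to_multiple(height, rows_per_read) - height;  // 1

        std::printf("border right=%u bottom=%u\n", border_x, border_y);
        return 0;
    }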
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index ecee1ab..5057c8f 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -40,8 +40,9 @@
void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape{ input->info()->tensor_shape() };
const size_t transpose_w = 16 / input->info()->element_size();
@@ -53,10 +54,13 @@
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- _input = input;
- _output = output;
- const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const int scale_x = num_elems_processed_per_iteration;
+
+ _input = input;
+ _output = output;
/*
* Following an example of how the transposition1xW works when the input data type is F32
@@ -66,41 +70,23 @@
* |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
* |a30 a31 a32 a33|
*
- * If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
- * If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
+ * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
*/
// Create kernel
- std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
- std::string kernel_name = "gemm_transpose1x" + val_to_string(num_elems_processed_per_iteration) + "_" + data_type_name;
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+ std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
// Configure window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- float scale_x = 1.f;
-
- switch(input->info()->data_type())
- {
- case DataType::U8:
- scale_x = 16.f;
- break;
- case DataType::F16:
- scale_x = 8.f;
- break;
- case DataType::F32:
- scale_x = 4.f;
- break;
- default:
- // Do nothing
- break;
- }
+ ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
ICLKernel::configure(win);
}
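
For reference, the updated comment above states the transposed shape as [ height * W, ceil(width / W) ] with W = 16 / element size. A small standalone C++ sketch of that shape computation (illustrative dimensions only):

    #include <cstdio>

    int main()
    {
        // Illustrative F32 input (element size 4 bytes), 70 x 23 matrix
        const unsigned int element_size = 4;
        const unsigned int width = 70, height = 23;

        const unsigned int W          = 16 / element_size;    // elements packed per block, here 4
        const unsigned int out_width  = height * W;           // height * W       -> 92
        const unsigned int out_height = (width + W - 1) / W;  // ceil(width / W)  -> 18

        std::printf("gemm_transpose1x%u output: %u x %u\n", W, out_width, out_height);
        return 0;
    }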
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
index 9fc34a7..1f757fe 100644
--- a/src/core/CL/kernels/CLHarrisCornersKernel.cpp
+++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -91,8 +92,8 @@
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 4;
constexpr unsigned int num_elems_written_per_iteration = 4;
- constexpr unsigned int num_elems_read_per_iteration = 8;
- constexpr unsigned int num_rows_read_per_iteration = 3;
+ const unsigned int num_elems_read_per_iteration = block_size == 7 ? 10 : 8;
+ const unsigned int num_rows_read_per_iteration = block_size;
Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 8c0fe26..98a799f 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -29,8 +29,10 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
#include <tuple>
@@ -38,14 +40,15 @@
using namespace arm_compute;
CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _num_elems_processed_per_iteration(1), _run_func(nullptr)
+ : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
{
}
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
@@ -55,6 +58,11 @@
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
build_opts.emplace((has_bias ? "-DHAS_BIAS" : ""));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
+
int pad_x = 0;
int pad_y = 0;
int stride_x = 0;
@@ -70,45 +78,31 @@
if(!run_img2col_reduced)
{
- _convolved_dims = convolved_dims;
- _conv_info = conv_info;
- _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 1 : 0)) / input->info()->dimension(2));
+ _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+ kernel_dims.width, kernel_dims.height,
+ conv_info);
_num_elems_processed_per_iteration = output->info()->dimension(0);
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+ build_opts.emplace("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
+ build_opts.emplace("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+ build_opts.emplace("-DKERNEL_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.emplace("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(_convolved_dims.first));
+ build_opts.emplace("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(_convolved_dims.second));
+ build_opts.emplace("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
+ build_opts.emplace("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
+ build_opts.emplace("-DPAD_X=" + support::cpp11::to_string(conv_info.pad().first));
+ build_opts.emplace("-DPAD_Y=" + support::cpp11::to_string(conv_info.pad().second));
+ build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
- // Create static kernel arguments
- const cl_int2 input_dims =
+ if(kernel_dims.width == 3 && kernel_dims.height == 3 && conv_info.pad().first == 0 && conv_info.pad().second == 0)
{
- {
- static_cast<cl_int>(input->info()->dimension(0)),
- static_cast<cl_int>(input->info()->dimension(1)),
- }
- };
- const cl_int2 strides =
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_kernel3x3_padx0_pady0", build_opts));
+ }
+ else
{
- {
- stride_x,
- stride_y,
- }
- };
- const cl_int2 paddings =
- {
- {
- pad_x,
- pad_y,
- }
- };
-
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
- _kernel.setArg<cl_int>(idx++, _kernel_size);
- _kernel.setArg<cl_int>(idx++, input->info()->dimension(2) /* depth */);
- _kernel.setArg<cl_int>(idx++, _convolved_dims.first /* output width */);
- _kernel.setArg<cl_int2>(idx++, input_dims);
- _kernel.setArg<cl_int2>(idx++, strides);
- _kernel.setArg<cl_int2>(idx++, paddings);
-
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+ }
_run_func = &CLIm2ColKernel::run_generic;
}
else
@@ -122,7 +116,22 @@
Window win = calculate_max_window(*input->info(), Steps());
// The CLIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ if(!run_img2col_reduced)
+ {
+ // Set the Z dimension's step to the size of the whole dimension so that the window cannot be split across Z
+ win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
+ }
+
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "im2col_";
+ _config_id += (run_img2col_reduced ? "reduced_" : "");
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -136,22 +145,18 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- int pad_x = 0;
- int pad_y = 0;
- int stride_x = 0;
- int stride_y = 0;
- std::tie(pad_x, pad_y) = _conv_info.pad();
- std::tie(stride_x, stride_y) = _conv_info.stride();
-
// Get initial windows
- Window slice = window.first_slice_window_3D();
- Window slice_in = window.first_slice_window_3D();
- Window slice_out = window.first_slice_window_3D();
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ // Change the Z dimension's step back to 1
+ window_collapsed.set_dimension_step(Window::DimZ, 1);
+
+ Window slice = window_collapsed.first_slice_window_3D();
+ Window slice_in = window_collapsed.first_slice_window_3D();
+ Window slice_out = window_collapsed.first_slice_window_3D();
// Setup slice
slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
- slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
// Setup input slice
// The first three dimensions of the input are increased by the inner loops
@@ -166,13 +171,15 @@
do
{
- // Set inputs
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice_in);
add_2D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->dimension(2)));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
+ enqueue(queue, *this, slice, _lws_hint);
}
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out) && window.slide_window_slice_3D(slice_in));
+ while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
}
void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue)
@@ -181,7 +188,7 @@
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
Window out_window;
- out_window.use_tensor_dimensions(_output->info());
+ out_window.use_tensor_dimensions(_output->info()->tensor_shape());
Window out_slice = out_window.first_slice_window_1D();
Window in_slice = window.first_slice_window_3D();
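
As a quick illustration of the change above: _convolved_dims now comes from scaled_dimensions() with the kernel size and PadStrideInfo. A standalone sketch of the underlying arithmetic, assuming the usual floor-rounded formula (the helper name below is hypothetical, not a library call):

    #include <cstdio>
    #include <utility>

    // Assumed convolved output size with floor rounding:
    //   out = (in + 2 * pad - kernel) / stride + 1
    static std::pair<unsigned int, unsigned int> convolved_dims(unsigned int in_w, unsigned int in_h,
                                                                unsigned int k_w, unsigned int k_h,
                                                                unsigned int stride_x, unsigned int stride_y,
                                                                unsigned int pad_x, unsigned int pad_y)
    {
        const unsigned int out_w = (in_w + 2 * pad_x - k_w) / stride_x + 1;
        const unsigned int out_h = (in_h + 2 * pad_y - k_h) / stride_y + 1;
        return { out_w, out_h };
    }

    int main()
    {
        // 224x224 input, 3x3 kernel, stride 1, no padding -> 222x222 patch positions per channel
        const auto dims = convolved_dims(224, 224, 3, 3, 1, 1, 0, 0);
        std::printf("convolved: %u x %u\n", dims.first, dims.second);
        return 0;
    }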
diff --git a/src/core/CL/kernels/CLL2NormalizeKernel.cpp b/src/core/CL/kernels/CLL2NormalizeKernel.cpp
new file mode 100644
index 0000000..3e0758c
--- /dev/null
+++ b/src/core/CL/kernels/CLL2NormalizeKernel.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLL2NormalizeKernel::CLL2NormalizeKernel()
+ : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12)
+{
+}
+
+void CLL2NormalizeKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, unsigned int axis, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Sum and output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis. Supported axis is 0");
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _sum = sum;
+ _output = output;
+ _axis = axis;
+ _epsilon = epsilon;
+
+ const unsigned int num_elems_processed_per_iteration = 16;
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts));
+
+ // Set epsilon argument
+ unsigned int idx = num_arguments_per_1D_tensor() * 3;
+ _kernel.setArg<cl_float>(idx, _epsilon);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLL2NormalizeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window in_slice = window.first_slice_window_1D();
+ Window sum_slice = window_sum.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _sum, sum_slice);
+ add_1D_tensor_argument(idx, _output, in_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
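
For reference, a host-side sketch of what the l2_normalize kernel computes per row, assuming the usual definition out = in / sqrt(max(sum_of_squares, epsilon)); illustrative values, not the OpenCL source:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // One row along axis 0; sum_sq plays the role of the _sum tensor
        const std::vector<float> row = { 3.f, 4.f };
        const float epsilon = 1e-12f;

        float sum_sq = 0.f;
        for(float v : row)
        {
            sum_sq += v * v;
        }

        // Assumed definition: out = in / sqrt(max(sum_sq, epsilon))
        const float inv_norm = 1.f / std::sqrt(std::max(sum_sq, epsilon));
        for(float v : row)
        {
            std::printf("%f ", v * inv_norm); // 0.600000 0.800000
        }
        std::printf("\n");
        return 0;
    }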
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
index 794a1bc..508fb89 100644
--- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -101,7 +101,7 @@
Window slice = window.first_slice_window_2D();
Window matrix_b_window;
- matrix_b_window.use_tensor_dimensions(_input1->info());
+ matrix_b_window.use_tensor_dimensions(_input1->info()->tensor_shape());
Window slice_matrix_b = matrix_b_window.first_slice_window_3D();
do
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
index b0b748f..1bf831b 100644
--- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
@@ -40,10 +40,15 @@
using namespace arm_compute;
CLMeanStdDevKernel::CLMeanStdDevKernel()
- : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr)
+ : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _border_size(0)
{
}
+BorderSize CLMeanStdDevKernel::border_size() const
+{
+ return _border_size;
+}
+
void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
@@ -83,6 +88,8 @@
constexpr unsigned int num_elems_processed_per_iteration_x = 8;
const unsigned int num_elems_processed_per_iteration_y = input->info()->dimension(1);
+ _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - input->info()->dimension(0));
+
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
update_window_and_padding(win, input_access);
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
new file mode 100644
index 0000000..9b4533b
--- /dev/null
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <climits>
+
+using namespace arm_compute;
+
+CLMinMaxLayerKernel::CLMinMaxLayerKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(Window::DimX, 2);
+ output_shape.remove_dimension(1);
+ output_shape.remove_dimension(1);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.emplace("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax_layer", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output->info(), 0, 0, 2, output->info()->dimension(1));
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLMinMaxLayerKernel::reset(cl::CommandQueue &queue)
+{
+ _output->map(queue, true);
+
+ Window window_output;
+ window_output.use_tensor_dimensions(_output->info()->tensor_shape());
+ window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_output.collapse_if_possible(ICLKernel::window(), 1);
+
+ Iterator output(_output, window_output);
+
+ // Reset output
+ execute_window_loop(window_output, [&](const Coordinates & id)
+ {
+ auto *ptr = reinterpret_cast<float *>(output.ptr());
+ ptr[0] = std::numeric_limits<float>::max();
+ ptr[1] = std::numeric_limits<float>::lowest();
+ },
+ output);
+
+ _output->unmap(queue);
+}
+
+void CLMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Collapse min/max batches
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
+ Window slice = window_collapsed.first_slice_window_3D();
+ slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ Window window_output;
+ window_output.use_tensor_dimensions(_output->info()->tensor_shape());
+ window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_output.collapse_if_possible(ICLKernel::window(), 1);
+
+ Window output_slice = window_output.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ // Set inputs
+ add_3D_tensor_argument(idx, _input, slice);
+ add_1D_tensor_argument(idx, _output, output_slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && window_output.slide_window_slice_1D(output_slice));
+}
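
Note: per batch, the kernel above reduces all W x H x D values to a { min, max } pair. A host-side reference of that reduction (illustrative data only):

    #include <algorithm>
    #include <array>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // All values of one batch, flattened; the kernel writes one { min, max } pair per batch
        const std::vector<float> batch = { -1.5f, 0.25f, 7.f, -3.f };

        std::array<float, 2> min_max = { { batch[0], batch[0] } };
        for(float v : batch)
        {
            min_max[0] = std::min(min_max[0], v);
            min_max[1] = std::max(min_max[1], v);
        }
        std::printf("min=%f max=%f\n", min_max[0], min_max[1]); // -3, 7
        return 0;
    }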
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
index 939a53b..5636592 100644
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
@@ -32,7 +32,27 @@
#include <climits>
-using namespace arm_compute;
+namespace arm_compute
+{
+inline int32_t FloatFlip(float val)
+{
+ static_assert(sizeof(float) == sizeof(int32_t), "Float must be the same size as int32_t");
+ int32_t int_val = 0;
+
+ memcpy(&int_val, &val, sizeof(float));
+ int_val = (int_val >= 0) ? int_val : int_val ^ 0x7FFFFFFF;
+ return int_val;
+}
+
+inline float IFloatFlip(int32_t val)
+{
+ static_assert(sizeof(float) == sizeof(int32_t), "Float must be the same size as int32_t");
+ float flt_val = 0.f;
+
+ val = (val >= 0) ? val : val ^ 0x7FFFFFFF;
+ memcpy(&flt_val, &val, sizeof(float));
+ return flt_val;
+}
CLMinMaxKernel::CLMinMaxKernel()
: _input(nullptr), _min_max(), _data_type_max_min()
@@ -41,7 +61,7 @@
void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON(min_max == nullptr);
@@ -59,16 +79,33 @@
_data_type_max_min[0] = SHRT_MAX;
_data_type_max_min[1] = SHRT_MIN;
break;
+ case DataType::F32:
+ _data_type_max_min[0] = FloatFlip(std::numeric_limits<float>::max());
+ _data_type_max_min[1] = FloatFlip(std::numeric_limits<float>::lowest());
+ break;
default:
ARM_COMPUTE_ERROR("You called with the wrong image data types");
}
// Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DDATA_TYPE_MAX=" + val_to_string<int>(_data_type_max_min[0]));
- build_opts.emplace("-DDATA_TYPE_MIN=" + val_to_string<int>(_data_type_max_min[1]));
- build_opts.emplace((0 != (num_elems_processed_per_iteration % max_cl_vector_width)) ? "-DNON_MULTIPLE_OF_16" : "");
+ std::set<std::string> build_opts{ "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
+
+ if(num_elems_processed_per_iteration % max_cl_vector_width != 0)
+ {
+ build_opts.emplace("-DNON_MULTIPLE_OF_16");
+ }
+
+ if(input->info()->data_type() == DataType::F32)
+ {
+ build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(std::numeric_limits<float>::max()));
+ build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(std::numeric_limits<float>::lowest()));
+ build_opts.emplace("-DIS_DATA_TYPE_FLOAT");
+ }
+ else
+ {
+ build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(_data_type_max_min[0]));
+ build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(_data_type_max_min[1]));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax", build_opts));
@@ -76,11 +113,11 @@
// Set fixed arguments
unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters
_kernel.setArg(idx++, *_min_max);
- _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
+ _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(input->info()->dimension(0)));
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, ceil_to_multiple(num_elems_processed_per_iteration, 16)));
ICLKernel::configure(win);
}
@@ -100,6 +137,28 @@
enqueue(queue, *this, slice);
}
while(window.slide_window_slice_2D(slice));
+
+ cl_int min = 0;
+ cl_int max = 0;
+ queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 0 * sizeof(cl_int), sizeof(cl_int), static_cast<int *>(&min));
+ queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 1 * sizeof(cl_int), sizeof(cl_int), static_cast<int *>(&max));
+
+ if(_input->info()->data_type() == DataType::F32)
+ {
+ std::array<float, 2> min_max =
+ {
+ {
+ IFloatFlip(min),
+ IFloatFlip(max)
+ }
+ };
+ queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(float), min_max.data());
+ }
+ else
+ {
+ std::array<int32_t, 2> min_max = { { min, max } };
+ queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(int32_t), min_max.data());
+ }
}
CLMinMaxLocationKernel::CLMinMaxLocationKernel()
@@ -109,7 +168,7 @@
void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON(min_max == nullptr);
ARM_COMPUTE_ERROR_ON(min_max_count == nullptr && min_loc == nullptr && max_loc == nullptr);
@@ -123,6 +182,10 @@
build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : "");
build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : "");
build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : "");
+ if(input->info()->data_type() == DataType::F32)
+ {
+ build_opts.emplace("-DIS_DATA_TYPE_FLOAT");
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmaxloc", build_opts));
@@ -167,3 +230,4 @@
}
while(window.slide_window_slice_2D(slice));
}
+} // namespace arm_compute
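
For reference, FloatFlip()/IFloatFlip() above map float ordering onto int32 ordering so the kernel can keep using integer min/max for F32 inputs. A standalone check of that property (helper restated here, illustrative values):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Restatement of FloatFlip(): non-negative floats already compare correctly as int32;
    // negative floats are XOR-ed (excluding the sign bit) so larger magnitudes map to smaller ints.
    static int32_t float_flip(float val)
    {
        int32_t i = 0;
        std::memcpy(&i, &val, sizeof(float));
        return (i >= 0) ? i : (i ^ 0x7FFFFFFF);
    }

    int main()
    {
        const float a = -2.5f, b = -1.0f, c = 0.0f, d = 3.25f;
        // Ordering is preserved, so integer comparisons give the float result: prints 1 1 1 1
        std::printf("%d %d %d %d\n",
                    float_flip(a) < float_flip(b), float_flip(b) < float_flip(c),
                    float_flip(c) < float_flip(d), float_flip(a) < float_flip(d));
        return 0;
    }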
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 106a511..a744739 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -35,7 +36,7 @@
using namespace arm_compute;
CLNormalizationLayerKernel::CLNormalizationLayerKernel()
- : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0)
+ : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false)
{
}
@@ -44,48 +45,61 @@
return _border_size;
}
-void CLNormalizationLayerKernel::configure(const ICLTensor *input, const ICLTensor *squared_input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
+ }
+
+ _input = input;
+ _output = output;
+
+ _is_in_map = (norm_info.type() != NormType::CROSS_MAP);
+ const unsigned int border_width = _is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+ _border_size = BorderSize(0, border_width);
+
+ const unsigned int num_elems_processed_per_iteration = (is_data_type_fixed_point(input->info()->data_type())) ? 16 : 4;
+ const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
// Set build options
std::set<std::string> build_opts;
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-
- _input = input;
- _squared_input = squared_input;
- _output = output;
-
- const bool is_in_map = (norm_info.type() == NormType::IN_MAP_1D);
- const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
- _border_size = BorderSize(0, border_width);
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+ }
+ build_opts.emplace(("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
+ build_opts.emplace(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
+ build_opts.emplace(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ build_opts.emplace(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
+ build_opts.emplace(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
// Create kernel
std::string kernel_name = (norm_info.type() == NormType::IN_MAP_1D) ? "normalization_layer_in_map_1D" : "normalization_layer_cross_map";
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
- // Set kernel static arguments
- unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
- _kernel.setArg<cl_float>(idx++, norm_info.scale_coeff());
- _kernel.setArg<cl_float>(idx++, norm_info.beta());
- _kernel.setArg<cl_float>(idx++, norm_info.kappa());
- _kernel.setArg<cl_uint>(idx++, norm_info.norm_size() / 2);
-
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = (is_in_map) ? 4 : 1;
- const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
-
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration);
- AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, num_elems_read_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, squared_input_access, output_access);
+ update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, input->info()->valid_region());
@@ -97,15 +111,16 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- Window slice = window.first_slice_window_3D();
+ const int collapsed_dimension = _is_in_map ? Window::DimZ : 4;
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), collapsed_dimension);
+ Window slice = window_collapsed.first_slice_window_3D();
do
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _squared_input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_3D(slice));
+ while(window_collapsed.slide_window_slice_3D(slice));
}
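
As an illustration of what the new build options above parametrise (COEFF, BETA, KAPPA, RADIUS), a host-side sketch assuming the usual LRN definition out[c] = in[c] / (kappa + coeff * sum of squares over the window)^beta; values are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        // Cross-map window around each channel c of one spatial position
        const std::vector<float> in = { 1.f, 2.f, 3.f, 4.f };
        const float kappa = 1.f, alpha = 1e-4f, beta = 0.75f;
        const int   norm_size = 3, radius = norm_size / 2;
        const float coeff = alpha / norm_size; // corresponds to norm_info.scale_coeff()
        const int   last  = static_cast<int>(in.size()) - 1;

        for(int c = 0; c <= last; ++c)
        {
            float sum_sq = 0.f;
            for(int j = std::max(0, c - radius); j <= std::min(last, c + radius); ++j)
            {
                sum_sq += in[j] * in[j];
            }
            std::printf("%f ", in[c] / std::pow(kappa + coeff * sum_sq, beta));
        }
        std::printf("\n");
        return 0;
    }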
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index 84eb434..33c8b81 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -48,12 +48,36 @@
void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. ");
+ if(is_data_type_fixed_point(input1->info()->data_type()))
+ {
+ // All data types must be all QS8 or all QS16
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_MSG(scale != 1, "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
+ }
_input1 = input1;
_input2 = input2;
@@ -79,13 +103,28 @@
if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type()))
{
scale_int = -1;
- compute_type = (DataType::F32 == input1->info()->data_type() || DataType::F32 == input2->info()->data_type()) ? "float" : "half";
+ compute_type = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half";
data_type = "DATA_TYPE_FLOAT";
}
else
{
- compute_type = (DataType::S16 == input1->info()->data_type() || DataType::S16 == input2->info()->data_type()) ? "int" : "ushort";
- data_type = "DATA_TYPE_INT";
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ compute_type = "int";
+ }
+ else if(input1->info()->data_type() == DataType::QS8)
+ {
+ compute_type = "qs8";
+ }
+ else if(input1->info()->data_type() == DataType::QS16)
+ {
+ compute_type = "qs16";
+ }
+ else
+ {
+ compute_type = "ushort";
+ }
+ data_type = "DATA_TYPE_INT";
}
// Construct kernel name
@@ -96,6 +135,10 @@
std::set<std::string> build_opts;
build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte");
+ if(is_data_type_fixed_point(input1->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position()));
+ }
build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
@@ -106,7 +149,7 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
// Set scale argument
- unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the inputs and output parameters
+ unsigned int idx = 3 * num_arguments_per_3D_tensor(); //Skip the inputs and output parameters
if(scale_int >= 0)
{
@@ -140,15 +183,15 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window slice = window.first_slice_window_3D();
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input1, slice);
+ add_3D_tensor_argument(idx, _input2, slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(window.slide_window_slice_3D(slice));
}
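
For reference, a host-side sketch of the element-wise semantics configured above for the U8 case: product times scale, truncated toward zero (RoundingPolicy::TO_ZERO) and clamped to the output range (ConvertPolicy::SATURATE). Illustrative values only, not the OpenCL source:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const std::vector<uint8_t> in1 = { 10, 200, 255 };
        const std::vector<uint8_t> in2 = { 3, 2, 2 };
        const float scale = 0.5f;

        for(size_t i = 0; i < in1.size(); ++i)
        {
            const float prod = static_cast<float>(in1[i]) * static_cast<float>(in2[i]) * scale;
            const int   val  = static_cast<int>(prod);            // truncate toward zero
            std::printf("%d ", std::min(std::max(val, 0), 255));  // saturate to [0, 255] -> 15 200 255
        }
        std::printf("\n");
        return 0;
    }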
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index dc5ae4e..497e87b 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -41,7 +41,7 @@
using namespace arm_compute;
CLPoolingLayerKernel::CLPoolingLayerKernel()
- : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0)
+ : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
{
}
@@ -52,103 +52,126 @@
void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
{
- int pool_pad_x = 0;
- int pool_pad_y = 0;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- const PoolingType pool_type = pool_info.pool_type();
- const int pool_size = pool_info.pool_size();
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
- DimensionRoundingType pool_round = pad_stride_info.round();
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ const PoolingType pool_type = pool_info.pool_type();
+ const int pool_size = pool_info.pool_size();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
+ ARM_COMPUTE_ERROR_ON(pool_size > 7 && is_data_type_fixed_point(input->info()->data_type()));
// Check output dimensions
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
input->info()->dimension(1),
pool_size,
- pool_stride_x, pool_stride_y,
- pool_pad_x, pool_pad_y,
- pool_round);
- ARM_COMPUTE_UNUSED(pooled_w);
- ARM_COMPUTE_UNUSED(pooled_h);
+ pool_size,
+ pool_info.pad_stride_info());
+
+ // Output auto initialization if not yet initialized
+ {
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(0, pooled_w);
+ output_shape.set(1, pooled_h);
+
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
- const int input_width = input->info()->dimension(0);
- const int input_height = input->info()->dimension(1);
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
// Set instance variables
- _input = input;
- _output = output;
- _pool_info = pool_info;
- _border_size = BorderSize(pool_pad_y, pool_pad_x);
- _border_size.right = std::max(upper_bound_w, pool_pad_x);
- _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+ _border_size = BorderSize(pool_pad_y, pool_pad_x);
// Set build options
std::set<std::string> build_opts;
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? std::string("MAX") : std::string("AVG"))));
+ build_opts.emplace(("-DPOOL_" + string_from_pooling_type(pool_type)));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
+
+ build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)));
+ if(pool_type != PoolingType::MAX)
+ {
+ build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x)));
+ build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y)));
+ build_opts.emplace(("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)));
+ build_opts.emplace(("-DPAD_X=" + support::cpp11::to_string(pool_pad_x)));
+ build_opts.emplace(("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y)));
+ }
// Create kernel
- std::string kernel_name = "pooling_layer_" + val_to_string(pool_size);
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
-
- // Set static kernel arguments
- if(pool_type == PoolingType::AVG)
+ if(pool_size <= 7)
{
- // Create static kernel arguments
- const cl_int2 max_dims =
- {
- {
- static_cast<cl_int>(input->info()->dimension(0)) + pool_pad_x,
- static_cast<cl_int>(input->info()->dimension(1)) + pool_pad_y,
- }
- };
- const cl_int2 strides =
- {
- {
- pool_stride_x,
- pool_stride_y,
- }
- };
- const cl_int2 paddings =
- {
- {
- pool_pad_x,
- pool_pad_y,
- }
- };
+ // Check if we have a 3x3 pooling with stride_x less than or equal to 3. In this case, run an optimized OpenCL kernel where
+ // each thread computes 4 output elements
+ const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type());
- // Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_3D_tensor();
- _kernel.setArg<cl_int2>(idx++, max_dims);
- _kernel.setArg<cl_int2>(idx++, strides);
- _kernel.setArg<cl_int2>(idx++, paddings);
+ int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size;
+ if(is_pool3x3_stride_le3)
+ {
+ // Change the number of elements processed and read per iteration for 3x3 pooling with stride_x less than or equal to 3
+ _num_elems_processed_per_iteration = 4;
+ num_elements_read_per_iteration = pool_size * (pool_stride_x + 1);
+ }
+
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+ _border_size.right = std::max(upper_bound_w, pool_pad_x);
+ _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+ std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size);
+ if(is_pool3x3_stride_le3)
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts));
+ }
+ else
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+ }
+ }
+ else // Run general case
+ {
+ _num_elems_processed_per_iteration = 1;
+
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+ _border_size.right = std::max(upper_bound_w, pool_pad_x);
+ _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+ build_opts.emplace(("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size)));
+ if(input->info()->data_type() == DataType::F16)
+ {
+ build_opts.emplace("-DFP16");
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("pooling_layer_N", build_opts));
}
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 1;
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
+ Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
-
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
ICLKernel::configure(win);
}
@@ -161,13 +184,14 @@
std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- Window slice = window.first_slice_window_3D();
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_3D();
do
{
// Upsample input by pool size
Window in_slice(slice);
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x));
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
// Set inputs
@@ -176,5 +200,5 @@
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_3D(slice));
+ while(window_collapsed.slide_window_slice_3D(slice));
}
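
Note: the pooled size and right/bottom border computed in configure() above follow from the usual floor-rounded formula. A standalone sketch with illustrative values (non-vectorised case, so the read width equals pool_size):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int in_w = 7, in_h = 7;
        const int pool_size = 3, stride_x = 2, stride_y = 2, pad_x = 0, pad_y = 0;

        // Assumed floor rounding: pooled = (in + 2 * pad - pool_size) / stride + 1
        const int pooled_w = (in_w + 2 * pad_x - pool_size) / stride_x + 1; // 3
        const int pooled_h = (in_h + 2 * pad_y - pool_size) / stride_y + 1; // 3

        // How far the last window reads past the input, clamped below by the padding
        const int upper_bound_w = (pooled_w - 1) * stride_x - pad_x + pool_size - in_w;
        const int upper_bound_h = (pooled_h - 1) * stride_y - pad_y + pool_size - in_h;

        std::printf("pooled %dx%d, border right=%d bottom=%d\n", pooled_w, pooled_h,
                    std::max(upper_bound_w, pad_x), std::max(upper_bound_h, pad_y));
        return 0;
    }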
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
new file mode 100644
index 0000000..4756443
--- /dev/null
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLQuantizationLayerKernel::CLQuantizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _min_max(nullptr)
+{
+}
+
+void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *min_max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output, min_max);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::U8, 0);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+ _min_max = min_max;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("quantization_layer"));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+
+ // Update window and padding
+ update_window_and_padding(win, input_access, output_access, min_max_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLQuantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
+ Window slice = window_collapsed.first_slice_window_3D();
+
+ Window window_min_max;
+ window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
+ window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_min_max.collapse_if_possible(ICLKernel::window(), 1);
+
+ Window slice_min_max = window_min_max.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ add_1D_tensor_argument(idx, _min_max, slice_min_max);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && window_min_max.slide_window_slice_1D(slice_min_max));
+}
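The OpenCL source for "quantization_layer" is not included in this hunk, so the standalone C++ sketch below only illustrates the per-tensor min/max quantization this kernel is built around; the helper name and the clamping behaviour are assumptions for illustration, not the library's API.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical reference: quantize an F32 buffer to U8 using a per-tensor [min, max] pair,
// mirroring the inputs CLQuantizationLayerKernel wires up (input, output, min_max).
std::vector<uint8_t> quantize_f32_to_u8(const std::vector<float> &in, float min_val, float max_val)
{
    std::vector<uint8_t> out(in.size());
    const float range = std::max(max_val - min_val, 1e-7f); // guard against a degenerate range
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        const float normalized = (in[i] - min_val) / range;       // map to [0, 1]
        const float scaled     = std::round(normalized * 255.0f); // map to [0, 255]
        out[i] = static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, scaled)));
    }
    return out;
}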
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
new file mode 100644
index 0000000..4e000c6
--- /dev/null
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
+ : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
+{
+}
+
+void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+ ARM_COMPUTE_ERROR_ON(rois->num_values() == 0);
+
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(rois->num_values() != output->info()->dimension(3));
+
+ // Set instance variables
+ _input = input;
+ _rois = rois;
+ _output = output;
+ _pool_info = pool_info;
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX))));
+ build_opts.emplace(("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY))));
+ build_opts.emplace(("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ))));
+ build_opts.emplace(("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())));
+ build_opts.emplace(("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())));
+ build_opts.emplace(("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale())));
+
+ // Create kernel
+ std::string kernel_name = "roi_pooling_layer";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_3D_tensor() + num_arguments_per_1D_array();
+ add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
+ add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 1;
+ Window window = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic input_access(input->info(),
+ input->info()->valid_region().start(0),
+ input->info()->valid_region().start(1),
+ input->info()->valid_region().end(0),
+ input->info()->valid_region().end(1));
+ AccessWindowStatic output_access(output->info(), 0, 0, pool_info.pooled_width(), pool_info.pooled_height());
+
+ update_window_and_padding(window, input_access, output_access);
+ output_access.set_valid_region(window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure(window);
+}
+
+void CLROIPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice(window);
+ // Parallelize spatially and across the fourth dimension of the output tensor (also across ROIArray)
+ slice.set(Window::DimZ, window[3]);
+
+ // Set arguments
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_1D_array_argument<ROI>(idx, _rois, Strides(sizeof(ROI)), 1U, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+}
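The kernel parallelises over the pooled output and, per ROI, max-pools each bin of the scaled region. The sketch below shows that bin arithmetic for a single-channel, row-major feature map; the SimpleROI struct and the rounding of the ROI by spatial_scale follow the usual ROI-pooling formulation and are assumptions for illustration, not the kernel's OpenCL source.

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

struct SimpleROI { int x, y, width, height; }; // illustrative ROI in input-image coordinates

// Max-pool one ROI of a single-channel feature map (row-major, w x h) into pooled_w x pooled_h bins.
std::vector<float> roi_pool(const std::vector<float> &fm, int w, int h,
                            SimpleROI roi, float spatial_scale, int pooled_w, int pooled_h)
{
    // Scale the ROI from image coordinates to feature-map coordinates
    const int rx = static_cast<int>(std::round(roi.x * spatial_scale));
    const int ry = static_cast<int>(std::round(roi.y * spatial_scale));
    const int rw = std::max(1, static_cast<int>(std::round(roi.width * spatial_scale)));
    const int rh = std::max(1, static_cast<int>(std::round(roi.height * spatial_scale)));

    std::vector<float> out(pooled_w * pooled_h, 0.f);
    for(int py = 0; py < pooled_h; ++py)
    {
        for(int px = 0; px < pooled_w; ++px)
        {
            // Bin boundaries inside the ROI, clamped to the feature map
            const int x0 = std::min(w, std::max(0, rx + (px * rw) / pooled_w));
            const int x1 = std::min(w, std::max(0, rx + ((px + 1) * rw + pooled_w - 1) / pooled_w));
            const int y0 = std::min(h, std::max(0, ry + (py * rh) / pooled_h));
            const int y1 = std::min(h, std::max(0, ry + ((py + 1) * rh + pooled_h - 1) / pooled_h));

            float max_val = std::numeric_limits<float>::lowest();
            bool  hit     = false;
            for(int y = y0; y < y1; ++y)
            {
                for(int x = x0; x < x1; ++x)
                {
                    max_val = std::max(max_val, fm[y * w + x]);
                    hit     = true;
                }
            }
            out[py * pooled_w + px] = hit ? max_val : 0.f; // empty bins contribute zero
        }
    }
    return out;
}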
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
new file mode 100644
index 0000000..18a8e35
--- /dev/null
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLReductionOperationKernel::CLReductionOperationKernel()
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE), _border_size()
+{
+}
+
+BorderSize CLReductionOperationKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Output tensor auto initialization if not yet initialized
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(axis, 1);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis. Supported axis is 0");
+
+ const unsigned int num_elems_processed_per_iteration = 16;
+ const unsigned int border_width = ((input->info()->dimension(0) % 128) != 0) ? 128 - input->info()->dimension(0) % 128 : 0;
+
+ _input = input;
+ _output = output;
+ _reduction_axis = axis;
+ _op = op;
+ _lws_hint = cl::NDRange(8);
+ _border_size = BorderSize(0, border_width, 0, 0);
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
+
+ switch(op)
+ {
+ case ReductionOperation::SUM_SQUARE:
+ build_opts.emplace(("-DOPERATION=square_sum"));
+ break;
+ case ReductionOperation::SUM:
+ build_opts.emplace(("-DOPERATION=sum"));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction operation");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input->info(), 0, 0, input->info()->dimension(0) + border_width, 1);
+ AccessWindowHorizontal output_access(output->info(), 0, 1);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, output->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_1D();
+ Window out_slice = out_window.first_slice_window_1D();
+
+ // Reshape window
+ const unsigned int border_width = ((in_slice.x().end() % 128) != 0) ? 128 - in_slice.x().end() % 128 : 0;
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
+
+ // Set local sums buffer
+ unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
+ _kernel.setArg(num_arguments_per_1D_tensor() * 2, local_sum_size, nullptr);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, _lws_hint);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+}
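Both configure() and run() above pad each row up to the next multiple of 128 so the work-group (local size 8, 16 elements per work item) always reduces fixed-size chunks. A minimal host-side sketch of that border arithmetic and of the invariant that padded elements must not affect the sum:

#include <cstddef>
#include <vector>

// Pad a row length up to the next multiple of 128, exactly as configure() and run() above do.
unsigned int reduction_border_width(unsigned int width)
{
    return (width % 128 != 0) ? 128 - (width % 128) : 0;
}

// Reference sum over one row; the padded tail must behave as zeros so it cannot change the result.
float reduce_sum(const std::vector<float> &row)
{
    const unsigned int width  = static_cast<unsigned int>(row.size());
    const unsigned int padded = width + reduction_border_width(width);

    float acc = 0.f;
    for(unsigned int i = 0; i < padded; ++i)
    {
        acc += (i < width) ? row[i] : 0.f;
    }
    return acc;
}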
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
new file mode 100644
index 0000000..0131bd3
--- /dev/null
+++ b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+CLReshapeLayerKernel::CLReshapeLayerKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLReshapeLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Create kernel
+ std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_layer", build_opts));
+
+ // Add static arguments
+ const cl_int2 input_shape =
+ {
+ {
+ static_cast<cl_int>(_input->info()->tensor_shape()[0]),
+ static_cast<cl_int>(_input->info()->tensor_shape()[1])
+ }
+ };
+ const cl_int2 output_shape =
+ {
+ {
+ static_cast<cl_int>(_output->info()->tensor_shape()[0]),
+ static_cast<cl_int>(_output->info()->tensor_shape()[1])
+ }
+ };
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_int2>(idx++, input_shape);
+ _kernel.setArg<cl_int2>(idx++, output_shape);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output->info(), 0, 0, output->info()->tensor_shape().x(), output->info()->tensor_shape().y());
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLReshapeLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_3D();
+
+ // Set inputs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, window_collapsed);
+ add_3D_tensor_argument(idx, _output, window_collapsed);
+ enqueue(queue, *this, slice);
+}
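Only the 2D x/y extents of the two tensors are passed to the kernel, because a reshape preserves the linear element order and the remaining coordinates come from the 3D window. The sketch below shows that index remapping on the host, assuming the usual x-fastest layout; it is an illustration of the idea, not the OpenCL kernel's code.

#include <array>
#include <cstddef>

// Convert (x, y, z) coordinates of one shape into the coordinates of another shape
// with the same total number of elements, preserving the linear element order.
std::array<std::size_t, 3> remap_reshape(std::array<std::size_t, 3> coord,
                                         std::array<std::size_t, 3> in_shape,
                                         std::array<std::size_t, 3> out_shape)
{
    // Linear index with x as the fastest-moving dimension
    const std::size_t linear = coord[0] + coord[1] * in_shape[0] + coord[2] * in_shape[0] * in_shape[1];

    // Unravel the same linear index in the output shape
    std::array<std::size_t, 3> out_coord =
    {
        linear % out_shape[0],
        (linear / out_shape[0]) % out_shape[1],
        linear / (out_shape[0] * out_shape[1])
    };
    return out_coord;
}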
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index d74e837..82ebe64 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -46,9 +46,10 @@
void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output == input);
_input = input;
_output = output;
@@ -76,24 +77,33 @@
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 4;
- const int border_offset = (border_undefined) ? 0 : border_size().left;
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input->info(), -border_offset, -border_offset,
- input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+ const ValidRegion &input_valid_region = input->info()->valid_region();
+
+ // Reads can occur within the valid region of the input
+ AccessWindowStatic input_access(input->info(),
+ input_valid_region.anchor[0] - border_size().left, input_valid_region.anchor[1] - border_size().top,
+ input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
+ input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
+
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()), output->info()->tensor_shape(), policy, border_size(),
+ border_undefined));
ICLKernel::configure(win);
// Set static kernel arguments
+ const float scale_x = static_cast<float>(input->info()->dimension(0)) / output->info()->dimension(0);
+ const float scale_y = static_cast<float>(input->info()->dimension(1)) / output->info()->dimension(1);
+
unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
_kernel.setArg<float>(idx++, input->info()->dimension(0));
_kernel.setArg<float>(idx++, input->info()->dimension(1));
- _kernel.setArg<float>(idx++, output->info()->dimension(0));
- _kernel.setArg<float>(idx++, output->info()->dimension(1));
+ _kernel.setArg<float>(idx++, scale_x);
+ _kernel.setArg<float>(idx++, scale_y);
}
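The replaced arguments pass precomputed ratios scale_x = input_width / output_width and scale_y = input_height / output_height, so the kernel maps each output pixel straight back to input coordinates instead of receiving the raw output dimensions. A host-side sketch of that mapping for nearest-neighbour sampling, for illustration only (the actual interpolation and border handling live in the OpenCL kernel):

#include <algorithm>
#include <cmath>

// Map an output pixel (ox, oy) back to input coordinates using the same ratios
// the kernel receives as static arguments. Nearest-neighbour rounding shown as an example.
void scale_map_nearest(int ox, int oy, int in_w, int in_h, int out_w, int out_h, int &ix, int &iy)
{
    const float scale_x = static_cast<float>(in_w) / out_w;
    const float scale_y = static_cast<float>(in_h) / out_h;

    ix = std::min(in_w - 1, std::max(0, static_cast<int>(std::floor(ox * scale_x))));
    iy = std::min(in_h - 1, std::max(0, static_cast<int>(std::floor(oy * scale_y))));
}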
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 0470d52..da3b942 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -41,9 +41,19 @@
void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Softmax across the x dimension
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(0, 1);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
_input = input;
_output = output;
@@ -52,7 +62,16 @@
const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
// Set build options
- std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+ }
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ build_opts.emplace("-DUSE_F16");
+ }
// Tell the kernel that the width is not a multiple of 16
if((input->info()->dimension(0) % max_cl_vector_width) != 0)
@@ -64,7 +83,7 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
// Set fixed arguments
- unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
_kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
// Configure kernel window
@@ -88,11 +107,17 @@
void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
_input = input;
_max = max;
@@ -103,7 +128,16 @@
const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
// Set build options
- std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+ }
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ build_opts.emplace("-DUSE_F16");
+ }
// Tell the kernel that the width is not a multiple of 16
if((input->info()->dimension(0) % max_cl_vector_width) != 0)
@@ -115,7 +149,7 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
// Set fixed arguments
- unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
_kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
// Configure window
@@ -139,19 +173,20 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_3D();
do
{
unsigned int idx = 0;
// Set inputs
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _max, slice);
- add_2D_tensor_argument(idx, _output, slice);
- add_2D_tensor_argument(idx, _sum, slice);
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _max, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _sum, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(window_collapsed.slide_window_slice_3D(slice));
}
CLLogits1DNormKernel::CLLogits1DNormKernel()
@@ -161,10 +196,15 @@
void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
_input = input;
_sum = sum;
@@ -172,7 +212,11 @@
// Set build options
std::set<std::string> build_opts;
- build_opts.emplace(("-DUSE_" + string_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
@@ -198,7 +242,8 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = window_collapsed.first_slice_window_3D();
do
{
@@ -207,10 +252,10 @@
unsigned int idx = 0;
// Set inputs
- add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _sum, sum_slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _sum, sum_slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(window_collapsed.slide_window_slice_3D(slice));
}
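The three kernels updated above implement softmax as max, shifted exp/sum, then normalisation, which keeps the exponentials bounded and is why the max tensor feeds the second stage. A scalar reference of the same pipeline over one row, for illustration:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Reference softmax over one row, split into the same three stages as the CL kernels:
// 1) row maximum, 2) exp(x - max) and the running sum, 3) division by the sum.
std::vector<float> softmax_row(const std::vector<float> &row)
{
    const float max_val = *std::max_element(row.begin(), row.end()); // stage 1: softmax_layer_max

    std::vector<float> shifted_exp(row.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < row.size(); ++i)                      // stage 2: shift_exp_sum
    {
        shifted_exp[i] = std::exp(row[i] - max_val);
        sum += shifted_exp[i];
    }

    for(float &v : shifted_exp)                                      // stage 3: norm
    {
        v /= sum;
    }
    return shifted_exp;
}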
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 2ee6fcb..75d31d5 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -40,8 +40,9 @@
void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape{ input->info()->tensor_shape() };
const size_t w_out = input->info()->dimension(1);
@@ -52,8 +53,9 @@
 // Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
index fddb580..a47952f 100644
--- a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
@@ -88,8 +88,8 @@
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+ AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, input->info()->dimension(0) + border_size().right, input->info()->dimension(1) + border_size().bottom);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index 018f272..bc27477 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -34,31 +34,40 @@
using namespace arm_compute;
-CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
- : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr)
+CLWeightsReshapeKernel::CLWeightsReshapeKernel()
+ : _input(nullptr), _biases(nullptr), _output(nullptr)
{
}
void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- if(_is_shared)
- {
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2)));
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5);
- ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3);
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
- ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
- }
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- // Check biases
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.collapse(3);
+ const size_t tmp_dim = output_shape[0];
+ output_shape.set(0, output_shape[1]);
+ output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0));
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, dt, fixed_point_position);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->num_dimensions() != 1));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->num_dimensions() != 2));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3]));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3] || biases->info()->dimension(1) != input->info()->tensor_shape()[4]));
}
_biases = biases;
@@ -69,6 +78,10 @@
std::set<std::string> build_opts;
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
build_opts.emplace(((biases != nullptr) ? "-DHAS_BIAS" : ""));
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
@@ -88,49 +101,13 @@
ICLKernel::configure(win);
}
-CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
- : CLWeightsReshapeKernel(false)
-{
-}
-
-void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
Window out_window;
- out_window.use_tensor_dimensions(_output->info());
-
- Window in_slice = window.first_slice_window_3D();
- Window out_slice = out_window.first_slice_window_2D();
-
- // Set arguments
- unsigned idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- if(_biases != nullptr)
- {
- Window biases_slice;
- biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
- add_1D_tensor_argument(idx, _biases, biases_slice);
- }
-
- // Run kernel
- enqueue(queue, *this, in_slice);
-}
-
-CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
- : CLWeightsReshapeKernel(true)
-{
-}
-
-void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info());
+ out_window.use_tensor_dimensions(_output->info()->tensor_shape());
Window in_slice = window.first_slice_window_3D();
Window out_slice = out_window.first_slice_window_2D();
@@ -140,7 +117,7 @@
if(_biases != nullptr)
{
- biases_window.use_tensor_dimensions(_biases->info());
+ biases_window.use_tensor_dimensions(_biases->info()->tensor_shape());
biases_slice = biases_window.first_slice_window_1D();
}
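The new configure() derives the reshaped weights shape by collapsing the first three dimensions, swapping the result with the OFM dimension, and reserving one extra row when biases are appended. A small sketch of that shape arithmetic for the common 4D [kernel_x, kernel_y, ifm, ofm] weights layout, as an illustration of the code above:

#include <array>
#include <cstddef>

// Reproduce the output shape computed in CLWeightsReshapeKernel::configure() for a
// 4D weights tensor: collapse (x, y, ifm) into one dimension, swap it with ofm, and
// add one row if a bias vector is appended.
std::array<std::size_t, 2> reshaped_weights_shape(std::size_t kernel_x, std::size_t kernel_y,
                                                  std::size_t ifm, std::size_t ofm, bool has_bias)
{
    const std::size_t collapsed = kernel_x * kernel_y * ifm;             // output_shape.collapse(3)
    std::array<std::size_t, 2> shape = { ofm, collapsed + (has_bias ? 1u : 0u) }; // swap dims, +1 row for the bias
    return shape;
}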
diff --git a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
index 884da28..418d349 100644
--- a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
+++ b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
@@ -37,12 +37,12 @@
namespace
{
-inline void check_corner(float x, float y, float strength, InternalKeypoint *output, int32_t *num_corner_candidates, std::mutex *corner_candidates_mutex)
+inline void check_corner(float x, float y, float strength, InternalKeypoint *output, int32_t *num_corner_candidates, arm_compute::Mutex *corner_candidates_mutex)
{
if(strength != 0.0f)
{
/* Set index and update num_corner_candidate */
- std::unique_lock<std::mutex> lock(*corner_candidates_mutex);
+ std::unique_lock<arm_compute::Mutex> lock(*corner_candidates_mutex);
const int32_t idx = *num_corner_candidates;
@@ -55,12 +55,9 @@
}
}
-inline void corner_candidates(const float *__restrict input, InternalKeypoint *__restrict output, int32_t x, int32_t y, int32_t *num_corner_candidates, std::mutex *corner_candidates_mutex)
+inline void corner_candidates(const float *__restrict input, InternalKeypoint *__restrict output, int32_t x, int32_t y, int32_t *num_corner_candidates, arm_compute::Mutex *corner_candidates_mutex)
{
- check_corner(x + 0, y, *(input + 0), output, num_corner_candidates, corner_candidates_mutex);
- check_corner(x + 1, y, *(input + 1), output, num_corner_candidates, corner_candidates_mutex);
- check_corner(x + 2, y, *(input + 2), output, num_corner_candidates, corner_candidates_mutex);
- check_corner(x + 3, y, *(input + 3), output, num_corner_candidates, corner_candidates_mutex);
+ check_corner(x, y, *input, output, num_corner_candidates, corner_candidates_mutex);
}
} // namespace
@@ -86,7 +83,7 @@
_output = output;
_num_corner_candidates = num_corner_candidates;
- const unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_processed_per_iteration = 1;
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -96,8 +93,9 @@
INEKernel::configure(win);
}
-void CPPCornerCandidatesKernel::run(const Window &window)
+void CPPCornerCandidatesKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input(_input, window);
diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
index 62bfdd6..ebe3db9 100644
--- a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -59,8 +59,9 @@
IKernel::configure(Window()); // Default 1 iteration window
}
-void CPPDetectionWindowNonMaximaSuppressionKernel::run(const Window &window)
+void CPPDetectionWindowNonMaximaSuppressionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_input_output->buffer() == nullptr);
diff --git a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
index 09d3ccf..3b1c7ae 100644
--- a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
+++ b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
@@ -68,8 +68,9 @@
return false;
}
-void CPPSortEuclideanDistanceKernel::run(const Window &window)
+void CPPSortEuclideanDistanceKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICPPKernel::window(), window);
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index ff903e9..fc0b6e9 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -50,7 +50,7 @@
anchor[0] + border_size.left,
// Skip the border right of the image
// Make sure the window width is a multiple of the step size
- anchor[0] + border_size.left + ceil_to_multiple(shape[0] - border_size.left - border_size.right, steps[0]),
+ anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
steps[0]));
size_t n = 1;
@@ -62,7 +62,7 @@
// Skip the border above the image
anchor[1] + border_size.top,
// Skip the border below the image
- anchor[1] + border_size.top + ceil_to_multiple(shape[1] - border_size.top - border_size.bottom, steps[1]),
+ anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
steps[1]));
++n;
@@ -137,7 +137,7 @@
anchor[0] + border_size.left,
// Skip the border right of the image
// Make sure the window width is a multiple of the step size
- anchor[0] + border_size.left + ceil_to_multiple(shape[0] - border_size.left - border_size.right, steps[0]),
+ anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
steps[0]));
size_t n = 1;
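Both hunks clamp the border-adjusted extent to zero before rounding up, so a border wider than the tensor no longer underflows the unsigned subtraction. A minimal sketch of the corrected expression, assuming the usual round-up semantics for ceil_to_multiple:

#include <algorithm>

// Round value up to the next multiple of step (same contract assumed for arm_compute's helper).
int ceil_to_multiple(int value, int step)
{
    return ((value + step - 1) / step) * step;
}

// Border-adjusted window extent: clamping to zero prevents (shape - borders) from
// underflowing when the borders are wider than the tensor itself.
int window_extent(int anchor, int shape, int border_near, int border_far, int step)
{
    const int inner = std::max(0, shape - border_near - border_far);
    return anchor + border_near + ceil_to_multiple(inner, step);
}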
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index 4ddc0fe..693d851 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -213,8 +213,8 @@
PaddingSize padding;
padding.left = std::max(0, -min_x);
padding.right = std::max<int>(0, max_x - shape[0]);
- padding.top = shape.num_dimensions() == 1 ? 0 : std::max(0, -min_y);
- padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, max_y - shape[1]);
+ padding.top = std::max(0, -min_y);
+ padding.bottom = std::max<int>(0, max_y - shape[1]);
// Update strides in tensor info
return _info->extend_padding(padding);
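With the single-dimension special case removed, the required padding now follows directly from how far the access window reaches outside the tensor in each direction, in y as well as x. A host-side sketch of that calculation, with a plain struct standing in for PaddingSize:

#include <algorithm>

struct Padding { int left, right, top, bottom; }; // stand-in for arm_compute::PaddingSize

// Compute the padding needed so that accesses in [min_x, max_x) x [min_y, max_y)
// stay inside the allocated buffer of a width x height tensor.
Padding padding_for_access(int min_x, int max_x, int min_y, int max_y, int width, int height)
{
    Padding p{};
    p.left   = std::max(0, -min_x);         // accesses left of column 0
    p.right  = std::max(0, max_x - width);  // accesses past the last column
    p.top    = std::max(0, -min_y);         // accesses above row 0
    p.bottom = std::max(0, max_y - height); // accesses below the last row
    return p;
}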
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 0b29eca..4a54675 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -55,9 +55,9 @@
dst_info->set_valid_region(src_info->valid_region());
Window win_src;
- win_src.use_tensor_dimensions(src_info, Window::DimY);
+ win_src.use_tensor_dimensions(src_info->tensor_shape(), Window::DimY);
Window win_dst;
- win_dst.use_tensor_dimensions(dst_info, Window::DimY);
+ win_dst.use_tensor_dimensions(dst_info->tensor_shape(), Window::DimY);
Iterator src_it(&src, win_src);
Iterator dst_it(this, win_dst);
@@ -147,4 +147,4 @@
s << io_fmt.row_delim;
}
}
-}
\ No newline at end of file
+}
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
index edb0a0f..e0c2891 100644
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -201,8 +201,9 @@
INEKernel::configure(win);
}
-void NEAbsoluteDifferenceKernel::run(const Window &window)
+void NEAbsoluteDifferenceKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
index e5b933a..deafabe 100644
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -114,8 +114,9 @@
}
} // namespace fp16
-void NEAccumulateWeightedFP16Kernel::run(const Window &window)
+void NEAccumulateWeightedFP16Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
@@ -131,7 +132,7 @@
},
input, accum);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
namespace
{
@@ -262,8 +263,9 @@
INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}
-void NEAccumulateKernel::run(const Window &window)
+void NEAccumulateKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
Iterator input(_input, window);
@@ -300,8 +302,9 @@
INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}
-void NEAccumulateWeightedKernel::run(const Window &window)
+void NEAccumulateWeightedKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
@@ -342,8 +345,9 @@
INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}
-void NEAccumulateSquaredKernel::run(const Window &window)
+void NEAccumulateSquaredKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
Iterator input(_input, window);
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index a878078..67fc45b 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -41,21 +41,29 @@
using namespace arm_compute;
NEActivationLayerKernel::NEActivationLayerKernel()
- : _func(nullptr), _act_info(ActivationFunction::LOGISTIC)
+ : _input(nullptr), _output(nullptr), _func(nullptr), _act_info(ActivationFunction::LOGISTIC)
{
}
-void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ _input = input;
+ _act_info = activation_info;
+ _output = input;
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ if(output != nullptr)
+ {
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ _output = output;
+ }
// Activation functions : FP32
static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
@@ -65,12 +73,31 @@
{ ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, float> },
{ ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, float> },
{ ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, float> },
+ { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, float> },
+ { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LEAKY_RELU, float> },
{ ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, float> },
{ ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, float> },
{ ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, float> },
{ ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float> },
};
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ // Activation functions : FP16
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f16 =
+ {
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, float16_t> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, float16_t> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, float16_t> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, float16_t> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, float16_t> },
+ { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, float16_t> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, float16_t> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, float16_t> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, float16_t> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float16_t> },
+ };
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
// Activation functions : QS8
static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs8 =
{
@@ -79,32 +106,207 @@
{ ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint8_t> },
{ ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint8_t> },
{ ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint8_t> },
+ { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qint8_t> },
+ { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LEAKY_RELU, qint8_t> },
{ ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint8_t> },
{ ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint8_t> },
{ ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint8_t> },
{ ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint8_t> },
};
+ // Activation functions : QS16
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs16 =
+ {
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint16_t> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint16_t> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint16_t> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint16_t> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint16_t> },
+ { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qint16_t> },
+ { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LEAKY_RELU, qint16_t> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint16_t> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint16_t> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint16_t> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint16_t> },
+ };
- _input = input;
- _output = output;
- _act_info = activation_info;
switch(input->info()->data_type())
{
- case DataType::F32:
- _func = act_map_f32[activation_info.activation()];
- break;
case DataType::QS8:
_func = act_map_qs8[activation_info.activation()];
break;
+ case DataType::QS16:
+ _func = act_map_qs16[activation_info.activation()];
+ break;
+ case DataType::F32:
+ _func = act_map_f32[activation_info.activation()];
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ _func = act_map_f16[activation_info.activation()];
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
}
constexpr unsigned int num_elems_processed_per_iteration = 16;
- INESimpleKernel::configure(_input, _output, num_elems_processed_per_iteration);
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+ }
+ else
+ {
+ // In-place computation
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ }
+
+ ICPPKernel::configure(win);
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, float16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ static const float16x8_t CONST_0 = vdupq_n_f16(0.f);
+ static const float16x8_t CONST_1 = vdupq_n_f16(1.f);
+
+ const float16x8_t a = vdupq_n_f16(_act_info.a());
+ const float16x8_t b = vdupq_n_f16(_act_info.b());
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ const float16x8x2_t in = vld2q_f16(input_ptr);
+ float16x8x2_t tmp = { {} };
+
+ switch(F)
+ {
+ case ActivationFunction::ABS:
+ tmp =
+ {
+ {
+ vabsq_f16(in.val[0]),
+ vabsq_f16(in.val[1]),
+ }
+ };
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_f16(a, vmaxq_f16(CONST_0, in.val[0])),
+ vminq_f16(a, vmaxq_f16(CONST_0, in.val[1]))
+ }
+ };
+ break;
+ case ActivationFunction::LU_BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_f16(a, vmaxq_f16(b, in.val[0])),
+ vminq_f16(a, vmaxq_f16(b, in.val[1]))
+ }
+ };
+ break;
+ case ActivationFunction::LINEAR:
+ tmp =
+ {
+ {
+ vaddq_f16(b, vmulq_f16(a, in.val[0])),
+ vaddq_f16(b, vmulq_f16(a, in.val[1]))
+ }
+ };
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp =
+ {
+ {
+ vinvq_f16(vaddq_f16(CONST_1, vexpq_f16(vnegq_f16(in.val[0])))),
+ vinvq_f16(vaddq_f16(CONST_1, vexpq_f16(vnegq_f16(in.val[1])))),
+ }
+ };
+ break;
+ case ActivationFunction::RELU:
+ tmp =
+ {
+ {
+ vmaxq_f16(CONST_0, in.val[0]),
+ vmaxq_f16(CONST_0, in.val[1])
+ }
+ };
+ break;
+ case ActivationFunction::LEAKY_RELU:
+ tmp =
+ {
+ {
+ vbslq_f16(vcgtq_f16(in.val[0], CONST_0), in.val[0], vmulq_f16(a, in.val[0])),
+ vbslq_f16(vcgtq_f16(in.val[1], CONST_0), in.val[1], vmulq_f16(a, in.val[1]))
+ }
+ };
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp =
+ {
+ {
+ vlogq_f16(vaddq_f16(CONST_1, vexpq_f16(in.val[0]))),
+ vlogq_f16(vaddq_f16(CONST_1, vexpq_f16(in.val[1]))),
+ }
+ };
+ break;
+ case ActivationFunction::SQRT:
+ tmp =
+ {
+ {
+ vinvq_f16(vinvsqrtq_f16(in.val[0])),
+ vinvq_f16(vinvsqrtq_f16(in.val[1])),
+ }
+ };
+ break;
+ case ActivationFunction::SQUARE:
+ tmp =
+ {
+ {
+ vmulq_f16(in.val[0], in.val[0]),
+ vmulq_f16(in.val[1], in.val[1])
+ }
+ };
+ break;
+ case ActivationFunction::TANH:
+ tmp =
+ {
+ {
+ vmulq_f16(a, vtanhq_f16(vmulq_f16(b, in.val[0]))),
+ vmulq_f16(a, vtanhq_f16(vmulq_f16(b, in.val[1]))),
+ }
+ };
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ break;
+ }
+
+ vst2q_f16(output_ptr, tmp);
+ },
+ input, output);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
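The activation tables above gain LU_BOUNDED_RELU and LEAKY_RELU entries, and the FP16 branch implements each function on pairs of float16x8_t vectors. The scalar definitions that the vector code corresponds to are summarised below as a float reference, for illustration only; a and b are the ActivationLayerInfo parameters.

#include <algorithm>
#include <cmath>

// Scalar reference for the activations handled above.
float relu(float x)                              { return std::max(0.f, x); }
float bounded_relu(float x, float a)             { return std::min(a, std::max(0.f, x)); }
float lu_bounded_relu(float x, float a, float b) { return std::min(a, std::max(b, x)); } // newly added
float leaky_relu(float x, float a)               { return (x > 0.f) ? x : a * x; }       // newly added
float soft_relu(float x)                         { return std::log(1.f + std::exp(x)); }
float logistic(float x)                          { return 1.f / (1.f + std::exp(-x)); }
float scaled_tanh(float x, float a, float b)     { return a * std::tanh(b * x); }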
+
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
{
@@ -137,17 +339,6 @@
}
};
break;
- case ActivationFunction::BOUNDED_RELU:
- tmp =
- {
- {
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])),
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])),
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])),
- vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])),
- }
- };
- break;
case ActivationFunction::LINEAR:
tmp =
{
@@ -181,6 +372,39 @@
}
};
break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])),
+ }
+ };
+ break;
+ case ActivationFunction::LU_BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_f32(a, vmaxq_f32(b, in.val[0])),
+ vminq_f32(a, vmaxq_f32(b, in.val[1])),
+ vminq_f32(a, vmaxq_f32(b, in.val[2])),
+ vminq_f32(a, vmaxq_f32(b, in.val[3])),
+ }
+ };
+ break;
+ case ActivationFunction::LEAKY_RELU:
+ tmp =
+ {
+ {
+ vbslq_f32(vcgtq_f32(in.val[0], CONST_0), in.val[0], vmulq_f32(a, in.val[0])),
+ vbslq_f32(vcgtq_f32(in.val[1], CONST_0), in.val[1], vmulq_f32(a, in.val[1])),
+ vbslq_f32(vcgtq_f32(in.val[2], CONST_0), in.val[2], vmulq_f32(a, in.val[2])),
+ vbslq_f32(vcgtq_f32(in.val[3], CONST_0), in.val[3], vmulq_f32(a, in.val[3])),
+ }
+ };
+ break;
case ActivationFunction::SOFT_RELU:
tmp =
{
@@ -237,14 +461,14 @@
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
{
- Iterator input(_input, window);
- Iterator output(_output, window);
- int fixed_point_position = _input->info()->fixed_point_position();
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ const int fixed_point_position = _input->info()->fixed_point_position();
static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
- const qint8x16_t CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position));
- const qint8x16_t a = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position));
- const qint8x16_t b = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position));
+ const qint8x16_t CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position));
+ const qint8x16_t a = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.a(), fixed_point_position));
+ const qint8x16_t b = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.b(), fixed_point_position));
execute_window_loop(window, [&](const Coordinates & id)
{
@@ -259,29 +483,35 @@
case ActivationFunction::ABS:
tmp = vqabsq_qs8(in);
break;
- case ActivationFunction::BOUNDED_RELU:
- tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in));
- break;
case ActivationFunction::LINEAR:
tmp = vqmlaq_qs8(b, a, in, fixed_point_position);
break;
case ActivationFunction::LOGISTIC:
- tmp = vrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
+ tmp = vqrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
break;
case ActivationFunction::RELU:
tmp = vmaxq_qs8(CONST_0, in);
break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in));
+ break;
+ case ActivationFunction::LU_BOUNDED_RELU:
+ tmp = vminq_qs8(a, vmaxq_qs8(b, in));
+ break;
+ case ActivationFunction::LEAKY_RELU:
+ tmp = vbslq_s8(vcgtq_s8(in, CONST_0), in, vmulq_qs8(a, in, fixed_point_position));
+ break;
case ActivationFunction::SOFT_RELU:
tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position);
break;
case ActivationFunction::SQRT:
- tmp = vrecipq_qs8(vinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
+ tmp = vqrecipq_qs8(vqinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
break;
case ActivationFunction::SQUARE:
tmp = vqmulq_qs8(in, in, fixed_point_position);
break;
case ActivationFunction::TANH:
- tmp = vtanhq_qs8(in, fixed_point_position);
+ tmp = vqmulq_qs8(a, vqtanhq_qs8(vqmulq_qs8(b, in, fixed_point_position), fixed_point_position), fixed_point_position);
break;
default:
break;
@@ -292,10 +522,142 @@
input, output);
}
-void NEActivationLayerKernel::run(const Window &window)
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ static const qint16x8_t CONST_0 = vdupq_n_qs16(0);
+ const qint16x8_t CONST_1 = vdupq_n_qs16(sqcvt_qs16_f32(1.f, fixed_point_position));
+ const qint16x8_t a = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.a(), fixed_point_position));
+ const qint16x8_t b = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.b(), fixed_point_position));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ const qint16x8x2_t in = vld2q_s16(input_ptr);
+ qint16x8x2_t tmp = { {} };
+
+ switch(F)
+ {
+ case ActivationFunction::ABS:
+ tmp =
+ {
+ {
+ vqabsq_qs16(in.val[0]),
+ vqabsq_qs16(in.val[1]),
+ }
+ };
+ break;
+ case ActivationFunction::LINEAR:
+ tmp =
+ {
+ {
+ vqmlaq_qs16(b, a, in.val[0], fixed_point_position),
+ vqmlaq_qs16(b, a, in.val[1], fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp =
+ {
+ {
+ vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[0]), fixed_point_position)), fixed_point_position),
+ vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[1]), fixed_point_position)), fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::RELU:
+ tmp =
+ {
+ {
+ vmaxq_qs16(CONST_0, in.val[0]),
+ vmaxq_qs16(CONST_0, in.val[1]),
+ }
+ };
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[0])),
+ vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[1])),
+ }
+ };
+ break;
+ case ActivationFunction::LU_BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_qs16(a, vmaxq_qs16(b, in.val[0])),
+ vminq_qs16(a, vmaxq_qs16(b, in.val[1])),
+ }
+ };
+ break;
+ case ActivationFunction::LEAKY_RELU:
+ tmp =
+ {
+ {
+ vbslq_s16(vcgtq_s16(in.val[0], CONST_0), in.val[0], vmulq_qs16(a, in.val[0], fixed_point_position)),
+ vbslq_s16(vcgtq_s16(in.val[1], CONST_0), in.val[1], vmulq_qs16(a, in.val[1], fixed_point_position)),
+ }
+ };
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp =
+ {
+ {
+ vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[0], fixed_point_position)), fixed_point_position),
+ vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[1], fixed_point_position)), fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::SQRT:
+ tmp =
+ {
+ {
+ vqrecipq_qs16(vqinvsqrtq_qs16(in.val[0], fixed_point_position), fixed_point_position),
+ vqrecipq_qs16(vqinvsqrtq_qs16(in.val[1], fixed_point_position), fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::SQUARE:
+ tmp =
+ {
+ {
+ vqmulq_qs16(in.val[0], in.val[0], fixed_point_position),
+ vqmulq_qs16(in.val[1], in.val[1], fixed_point_position),
+ }
+ };
+ break;
+ case ActivationFunction::TANH:
+ tmp =
+ {
+ {
+ vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[0], fixed_point_position), fixed_point_position), fixed_point_position),
+ vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[1], fixed_point_position), fixed_point_position), fixed_point_position),
+ }
+ };
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Function not implemented");
+ break;
+ }
+
+ vst2q_qs16(output_ptr, tmp);
+ },
+ input, output);
+}
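
The QS8 and QS16 specialisations convert the float activation parameters with sqcvt_qs8_f32 / sqcvt_qs16_f32 before broadcasting them. A minimal scalar sketch of what such a saturating float-to-fixed-point conversion does, assuming round-to-nearest; the library helpers' exact rounding is not shown in this patch.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar model of a saturating float -> QS16 conversion with
// 'fixed_point_position' fractional bits; the library's sqcvt_qs16_f32 may round differently.
inline int16_t to_qs16(float value, int fixed_point_position)
{
    const float scaled  = value * static_cast<float>(1 << fixed_point_position);
    const float clamped = std::min(std::max(std::round(scaled), -32768.f), 32767.f);
    return static_cast<int16_t>(clamped);
}
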
+
+void NEActivationLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
(this->*_func)(window);
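
Every NEON kernel's run() now takes a ThreadInfo argument alongside the window, and kernels that do not need it mark it with ARM_COMPUTE_UNUSED. A minimal sketch of a user-defined kernel adapted to the new interface; the header paths and the class name MyCustomKernel are assumptions, and only the signature shown in this patch is relied upon.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/Window.h"

// Hypothetical user kernel updated to the run(Window, ThreadInfo) interface.
class MyCustomKernel : public arm_compute::INEKernel
{
public:
    void run(const arm_compute::Window &window, const arm_compute::ThreadInfo &info) override
    {
        ARM_COMPUTE_UNUSED(info); // this kernel does not use any per-thread information
        // ... iterate over 'window' as before ...
        ARM_COMPUTE_UNUSED(window);
    }
};
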
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index a4fdad8..f263fd0 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -45,6 +46,38 @@
namespace
{
+void add_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
+ const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vaddq_qs8(a, b));
+ },
+ input1, input2, output);
+}
+
+void add_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
+ const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqaddq_qs8(a, b));
+ },
+ input1, input2, output);
+}
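
As with the existing U8 and S16 paths, the wrap variant uses the plain NEON add (vaddq_qs8) and the saturate variant the saturating add (vqaddq_qs8). In scalar terms, illustrative only:

#include <algorithm>
#include <cstdint>

// Two's-complement wrap-around addition, the scalar equivalent of vaddq_qs8.
inline int8_t add_wrap_qs8(int8_t a, int8_t b)
{
    return static_cast<int8_t>(static_cast<uint8_t>(a) + static_cast<uint8_t>(b));
}

// Saturating addition clamped to [-128, 127], the scalar equivalent of vqaddq_qs8.
inline int8_t add_saturate_qs8(int8_t a, int8_t b)
{
    const int sum = static_cast<int>(a) + static_cast<int>(b);
    return static_cast<int8_t>(std::min(127, std::max(-128, sum)));
}
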
+
void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window);
@@ -112,6 +145,45 @@
return res;
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
+{
+ const float16x8x2_t res =
+ {
+ {
+ vaddq_f16(a.val[0], b.val[0]),
+ vaddq_f16(a.val[1], b.val[1])
+ }
+ };
+
+ return res;
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
+ const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
+
+ vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b));
+ },
+ input1, input2, output);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(in1);
+ ARM_COMPUTE_UNUSED(in2);
+ ARM_COMPUTE_UNUSED(out);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window);
@@ -294,26 +366,40 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
- if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output->info(), Format::S16);
- }
- else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
}
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
+ if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
+ {
+ // Check that all data types are the same and all fixed-point positions are the same
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+ }
static std::map<std::string, AddFunction *> map_function =
{
+ { "add_wrap_QS8_QS8_QS8", &add_wrap_QS8_QS8_QS8 },
+ { "add_saturate_QS8_QS8_QS8", &add_saturate_QS8_QS8_QS8 },
{ "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
{ "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
{ "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
@@ -322,10 +408,15 @@
{ "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
{ "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
{ "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
+ { "add_wrap_QS16_QS16_QS16", &add_wrap_S16_S16_S16 },
+ { "add_saturate_QS16_QS16_QS16", &add_saturate_S16_S16_S16 },
{ "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
{ "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
{ "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
{ "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
+ { "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
+ { "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
+
};
_input1 = input1;
@@ -368,8 +459,9 @@
INEKernel::configure(win);
}
-void NEArithmeticAdditionKernel::run(const Window &window)
+void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index d3e62b0..85f72c1 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -44,6 +45,38 @@
namespace
{
+void sub_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
+ const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vsubq_qs8(a, b));
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
+ const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqsubq_qs8(a, b));
+ },
+ input1, input2, output);
+}
+
void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window);
@@ -124,6 +157,45 @@
input1, input2, output);
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+inline float16x8x2_t vsub2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
+{
+ const float16x8x2_t res =
+ {
+ {
+ vsubq_f16(a.val[0], b.val[0]),
+ vsubq_f16(a.val[1], b.val[1])
+ }
+ };
+
+ return res;
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+void sub_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
+ const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
+
+ vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vsub2q_f16(a, b));
+ },
+ input1, input2, output);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(in1);
+ ARM_COMPUTE_UNUSED(in2);
+ ARM_COMPUTE_UNUSED(out);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window);
@@ -287,26 +359,40 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
- if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
- {
- set_format_if_unknown(*output->info(), Format::S16);
- }
- else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
}
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
+ if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
+ {
+ // Check that all data types are the same and all fixed-point positions are the same
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+ }
static std::map<std::string, SubFunction *> map_function =
{
+ { "sub_wrap_QS8_QS8_QS8", &sub_wrap_QS8_QS8_QS8 },
+ { "sub_saturate_QS8_QS8_QS8", &sub_saturate_QS8_QS8_QS8 },
{ "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
{ "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
{ "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
@@ -315,10 +401,15 @@
{ "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
{ "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
{ "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
+ { "sub_wrap_QS16_QS16_QS16", &sub_wrap_S16_S16_S16 },
+ { "sub_saturate_QS16_QS16_QS16", &sub_saturate_S16_S16_S16 },
{ "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
{ "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
{ "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
{ "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 },
+ { "sub_wrap_F16_F16_F16", &sub_F16_F16_F16 },
+ { "sub_saturate_F16_F16_F16", &sub_F16_F16_F16 },
+
};
_input1 = input1;
@@ -361,8 +452,9 @@
INEKernel::configure(win);
}
-void NEArithmeticSubtractionKernel::run(const Window &window)
+void NEArithmeticSubtractionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 9a216ae..f6f6f9c 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -38,7 +38,7 @@
{
}
-void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+void batch_normalization_q8(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
{
Iterator input(in, window);
Iterator output(out, window);
@@ -47,7 +47,7 @@
// Only compute denominator and NEON vectors once per feature map.
int slice = -1;
- int fixed_point_position = in->info()->fixed_point_position();
+ const int fixed_point_position = in->info()->fixed_point_position();
const auto input_mean = reinterpret_cast<const qint8_t *>(mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const qint8_t *>(var->ptr_to_element(Coordinates(0, 0)));
const auto input_gamma = reinterpret_cast<const qint8_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
@@ -58,7 +58,7 @@
qint8x16_t gamma_vec = vdupq_n_qs8(0);
qint8x16_t beta_vec = vdupq_n_qs8(0);
qint8x16_t denominator = vdupq_n_qs8(0);
- const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position));
+ const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(epsilon, fixed_point_position));
execute_window_loop(window, [&](const Coordinates & id)
{
if(slice != id.z())
@@ -82,7 +82,51 @@
input, output);
}
-void batch_normalization_fp32(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+void batch_normalization_q16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+{
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ // Hold information about the current feature map we are iterating.
+ // Only compute denominator and NEON vectors once per feature map.
+ int slice = -1;
+
+ const int fixed_point_position = in->info()->fixed_point_position();
+ const auto input_mean = reinterpret_cast<const qint16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const qint16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = reinterpret_cast<const qint16_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
+ const auto input_beta = reinterpret_cast<const qint16_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+ qint16x8_t mean_vec = vdupq_n_qs16(0);
+ qint16x8_t var_vec = vdupq_n_qs16(0);
+ qint16x8_t gamma_vec = vdupq_n_qs16(0);
+ qint16x8_t beta_vec = vdupq_n_qs16(0);
+ qint16x8_t denominator = vdupq_n_qs16(0);
+ const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(epsilon, fixed_point_position));
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(slice != id.z())
+ {
+            // Construct vectors
+ mean_vec = vdupq_n_qs16(*(input_mean + id.z()));
+ var_vec = vdupq_n_qs16(*(input_var + id.z()));
+ gamma_vec = vdupq_n_qs16(*(input_gamma + id.z()));
+ beta_vec = vdupq_n_qs16(*(input_beta + id.z()));
+
+ // Calculate denominator
+ denominator = vqinvsqrtq_qs16(vqaddq_qs16(var_vec, epsilon_vec), fixed_point_position);
+ slice = id.z();
+ }
+
+ // Calculate x bar and store results
+ const qint16x8_t numerator = vqsubq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), mean_vec);
+ const qint16x8_t x_bar = vqmulq_qs16(numerator, denominator, fixed_point_position);
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmlaq_qs16(beta_vec, x_bar, gamma_vec, fixed_point_position));
+ },
+ input, output);
+}
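
Per element, every variant of this kernel (QS8, QS16, F16, F32) computes the same transform, out = gamma * (x - mean) / sqrt(var + epsilon) + beta, with mean, var, gamma and beta taken per feature map (z slice). A scalar reference, illustrative only:

#include <cmath>

// Scalar reference of the batch normalization transform vectorised above (illustrative only).
inline float batch_norm_ref(float x, float mean, float var, float gamma, float beta, float epsilon)
{
    const float denominator = 1.f / std::sqrt(var + epsilon);
    const float x_bar       = (x - mean) * denominator;
    return gamma * x_bar + beta;
}
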
+
+void batch_normalization_fp32(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
{
Iterator input(in, window);
Iterator output(out, window);
@@ -125,29 +169,78 @@
input, output);
}
-void NEBatchNormalizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void batch_normalization_fp16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ // Hold information about the current feature map we are iterating.
+ // Only compute denominator and NEON vectors once per feature map.
+ int slice = -1;
+
+ const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
+ const auto input_beta = reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+ float16x8_t mean_vec = vdupq_n_f16(0.0);
+ float16x8_t var_vec = vdupq_n_f16(0.0);
+ float16x8_t gamma_vec = vdupq_n_f16(0.0);
+ float16x8_t beta_vec = vdupq_n_f16(0.0);
+ float16x8_t denominator = vdupq_n_f16(0.0);
+ const float16x8_t epsilon_vec = vdupq_n_f16(epsilon);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(slice != id.z())
+ {
+            // Construct vectors
+ mean_vec = vdupq_n_f16(*(input_mean + id.z()));
+ var_vec = vdupq_n_f16(*(input_var + id.z()));
+ gamma_vec = vdupq_n_f16(*(input_gamma + id.z()));
+ beta_vec = vdupq_n_f16(*(input_beta + id.z()));
+
+ // Calculate denominator
+ denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
+ slice = id.z();
+ }
+
+ // Calculate x bar and store results
+ const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
+ const float16x8_t x_bar = vmulq_f16(numerator, denominator);
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
+ },
+ input, output);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
_input = input;
- _output = output;
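+    // Default to in-place computation; overridden below when an output tensor is provided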
+ _output = input;
_mean = mean;
_var = var;
_gamma = gamma;
_beta = beta;
_epsilon = epsilon;
+ if(output != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _output = output;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
unsigned int num_elems_processed_per_iteration = 0;
switch(input->info()->data_type())
@@ -156,29 +249,43 @@
_func = &batch_normalization_q8;
num_elems_processed_per_iteration = 16;
break;
+ case DataType::QS16:
+ _func = &batch_normalization_q16;
+ num_elems_processed_per_iteration = 8;
+ break;
case DataType::F32:
_func = &batch_normalization_fp32;
num_elems_processed_per_iteration = 4;
break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &batch_normalization_fp16;
+ num_elems_processed_per_iteration = 8;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Element size not supported");
break;
}
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
-
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+ }
+ else
+ {
+ update_window_and_padding(win, input_access);
+ }
INEKernel::configure(win);
}
-void NEBatchNormalizationLayerKernel::run(const Window &window)
+void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index e8e448e..3888300 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -93,8 +93,9 @@
INEKernel::configure(win);
}
-void NEBitwiseAndKernel::run(const Window &window)
+void NEBitwiseAndKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input1(_input1, window);
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index bf75592..08d7fe2 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -81,8 +81,9 @@
INEKernel::configure(win);
}
-void NEBitwiseNotKernel::run(const Window &window)
+void NEBitwiseNotKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input(_input, window);
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index f184be2..1b17cc2 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -93,8 +93,9 @@
INEKernel::configure(win);
}
-void NEBitwiseOrKernel::run(const Window &window)
+void NEBitwiseOrKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input1(_input1, window);
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index c4fb4c0..9451e8a 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -89,8 +89,9 @@
INEKernel::configure(win);
}
-void NEBitwiseXorKernel::run(const Window &window)
+void NEBitwiseXorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input1(_input1, window);
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index d7e6d73..d7178e4 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -34,8 +34,9 @@
using namespace arm_compute;
#ifdef ARM_COMPUTE_ENABLE_FP16
-void NEBox3x3FP16Kernel::run(const Window &window)
+void NEBox3x3FP16Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
@@ -103,7 +104,7 @@
},
input, output);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
BorderSize NEBox3x3Kernel::border_size() const
{
@@ -144,8 +145,9 @@
INEKernel::configure(win);
}
-void NEBox3x3Kernel::run(const Window &window)
+void NEBox3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index 85a2cd5..bcbe790 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -787,7 +787,7 @@
INEKernel::configure(win);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
namespace
{
@@ -1677,8 +1677,9 @@
INEKernel::configure(win);
}
-void NEGradientKernel::run(const Window &window)
+void NEGradientKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -1758,8 +1759,9 @@
INEKernel::configure(win);
}
-void NEEdgeNonMaxSuppressionKernel::run(const Window &window)
+void NEEdgeNonMaxSuppressionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -1838,8 +1840,9 @@
INEKernel::configure(win);
}
-void NEEdgeTraceKernel::run(const Window &window)
+void NEEdgeTraceKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input(_input, window);
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index 3147a69..a2b24de 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -284,8 +284,9 @@
return _is_parallelizable;
}
-void NEChannelCombineKernel::run(const Window &window)
+void NEChannelCombineKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
index ebc4b85..bac2471 100644
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -148,12 +148,11 @@
_input = input;
_output = output;
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowRectangle output_access(input->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / subsampling, 1.f / subsampling);
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / subsampling, 1.f / subsampling);
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
- output_access);
+ update_window_and_padding(win, input_access, output_access);
ValidRegion input_valid_region = input->info()->valid_region();
@@ -257,16 +256,17 @@
_output = output;
Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal input_access(_input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, _input->info()->valid_region());
INEKernel::configure(win);
}
-void NEChannelExtractKernel::run(const Window &window)
+void NEChannelExtractKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index 6d370ac..460d37e 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -69,20 +69,21 @@
void NECol2ImKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- set_data_type_if_unknown(*output->info(), input->info()->data_type());
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape = input->info()->tensor_shape();
output_shape.set(0, convolved_dims.first);
output_shape.set(1, convolved_dims.second);
output_shape.set(2, input->info()->tensor_shape()[0]);
- set_shape_if_empty(*output->info(), output_shape);
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
@@ -115,8 +116,9 @@
INEKernel::configure(win);
}
-void NECol2ImKernel::run(const Window &window)
+void NECol2ImKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
index cb5152e..347aeae 100644
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -572,8 +572,9 @@
INEKernel::configure(win);
}
-void NEColorConvertKernel::run(const Window &window)
+void NEColorConvertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index 30e91ef..263fbe0 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -621,8 +621,9 @@
}
template <unsigned int matrix_size>
-void NEConvolutionKernel<matrix_size>::run(const Window &window)
+void NEConvolutionKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -694,8 +695,9 @@
}
template <unsigned int matrix_size>
-void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window)
+void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
switch(_output->info()->data_type())
@@ -1131,8 +1133,9 @@
}
template <unsigned int matrix_size>
-void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window)
+void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -1464,8 +1467,9 @@
INEKernel::configure(win);
}
-void NEConvolutionRectangleKernel::run(const Window &window)
+void NEConvolutionRectangleKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index 32789cb..b65f3ba 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -67,8 +67,9 @@
INEKernel::configure(calculate_max_window(*input->info()));
}
-void NECumulativeDistributionKernel::run(const Window &window)
+void NECumulativeDistributionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_distribution->buffer() == nullptr);
diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
index 902490e..7a62b0c 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
@@ -27,17 +27,76 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include <arm_neon.h>
+#include <cstdint>
using namespace arm_compute;
+namespace
+{
+// Overloads of 128-bit vector loads
+uint8x16_t loadq(const uint8_t *ptr)
+{
+ return vld1q_u8(ptr);
+}
+uint16x8_t loadq(const uint16_t *ptr)
+{
+ return vld1q_u16(ptr);
+}
+uint32x4_t loadq(const uint32_t *ptr)
+{
+ return vld1q_u32(ptr);
+}
+// Overloads of 128-bit vector stores
+void storeq(uint8_t *ptr, uint8x16_t val)
+{
+ return vst1q_u8(ptr, val);
+}
+void storeq(uint16_t *ptr, uint16x8_t val)
+{
+ return vst1q_u16(ptr, val);
+}
+void storeq(uint32_t *ptr, uint32x4_t val)
+{
+ return vst1q_u32(ptr, val);
+}
+
+template <typename T>
+void depth_concat(const ITensor *in, ITensor *out, std::pair<int, int> start_xy, int depth_offset, const Window &window)
+{
+ const int start_x = start_xy.first;
+ const int start_y = start_xy.second;
+
+ // Offset input
+ const int input_offset_to_first_elements_in_bytes = in->info()->offset_first_element_in_bytes() - start_x * in->info()->strides_in_bytes()[0] - start_y * in->info()->strides_in_bytes()[1];
+ uint8_t *input_ptr = in->buffer() + input_offset_to_first_elements_in_bytes;
+
+ // Offset output
+ const unsigned int output_offset_to_first_elements_in_bytes = out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2];
+ uint8_t *output_ptr = out->buffer() + output_offset_to_first_elements_in_bytes;
+
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(input_ptr + input.offset());
+ const auto out_ptr = reinterpret_cast<T *>(output_ptr + output.offset());
+
+ storeq(out_ptr, loadq(in_ptr));
+ },
+ input, output);
+}
+} // namespace
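
Since depth concatenation only copies data, the kernel dispatches on element size rather than on data type: QS8 uses the 8-bit copy, QS16 and F16 the 16-bit copy, and F32 the 32-bit copy, each moving one 128-bit register per iteration. An untyped scalar equivalent of a single iteration, illustrative only:

#include <cstdint>
#include <cstring>

// One iteration of depth_concat is a bitwise copy of 16 bytes (one NEON register),
// independent of the element type; shown here with memcpy for illustration only.
inline void copy_block_16(const std::uint8_t *in_ptr, std::uint8_t *out_ptr)
{
    std::memcpy(out_ptr, in_ptr, 16);
}
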
+
NEDepthConcatenateKernel::NEDepthConcatenateKernel()
- : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
{
}
@@ -48,8 +107,9 @@
void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
@@ -60,18 +120,36 @@
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+ _func = nullptr;
_input = input;
_output = output;
_depth_offset = depth_offset;
_left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
_top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
- const unsigned int num_elems_processed_per_iteration = 4;
- const unsigned int num_elems_read_per_iteration = 4;
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &depth_concat<uint8_t>;
+ break;
+ case DataType::QS16:
+ case DataType::F16:
+ _func = &depth_concat<uint16_t>;
+ break;
+ case DataType::F32:
+ _func = &depth_concat<uint32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size();
const unsigned int num_rows_read_per_iteration = 1;
// The window needs to be based on input as we copy all the depths of input
- Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
@@ -81,25 +159,12 @@
INEKernel::configure(win);
}
-void NEDepthConcatenateKernel::run(const Window &window)
+void NEDepthConcatenateKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
- // Offset output
- const unsigned int offset_to_first_elements_in_bytes = _output->info()->offset_first_element_in_bytes() + _left_right * _output->info()->strides_in_bytes()[0] + _top_bottom *
- _output->info()->strides_in_bytes()[1] + _depth_offset * _output->info()->strides_in_bytes()[2];
- uint8_t *output_ptr = _output->buffer() + offset_to_first_elements_in_bytes;
-
- Iterator input(_input, window);
- Iterator output(_output, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto out_ptr = reinterpret_cast<float *>(output_ptr + output.offset());
-
- vst1q_f32(out_ptr, vld1q_f32(in_ptr));
- },
- input, output);
+ (*_func)(_input, _output, std::make_pair(_left_right, _top_bottom), _depth_offset, window);
}
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
index 56612a7..d97a20b 100644
--- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
@@ -40,45 +40,91 @@
} // namespace arm_compute
NEDepthConvertKernel::NEDepthConvertKernel()
- : _policy(), _shift(0)
+ : _input(nullptr), _output(nullptr), _policy(), _shift(0), _fixed_point_position_input(0), _fixed_point_position_output(0)
{
}
-void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
- ARM_COMPUTE_ERROR_ON(shift >= 8);
- ARM_COMPUTE_ERROR_ON(input == output);
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different");
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32),
- "Only data_types supported [in] QS8 -> [out] F32");
+ _input = input;
+ _output = input;
+ _policy = policy;
+ _shift = shift;
+
+ if(output != nullptr)
+ {
+        // Auto initialize the output shape if not yet initialized (we can only auto-configure the shape; the data type must be given)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ // Set output
+ _output = output;
+ }
+
+ // Set initial fixed point position of input and output
+ _fixed_point_position_input = input->info()->fixed_point_position();
+ _fixed_point_position_output = _output->info()->fixed_point_position();
+
+ // Set the fixed point position to the output tensor if needed
+ if(is_data_type_fixed_point(input->info()->data_type()) && is_data_type_fixed_point(_output->info()->data_type()))
+ {
+ // If in-place set the fixed point position of the output tensor to be equal to shift
+ _fixed_point_position_output = (_input == _output) ? static_cast<int>(_shift) : _fixed_point_position_output;
+ // Set fixed point position to output tensor
+ _output->info()->set_fixed_point_position(_fixed_point_position_output);
+ }
+
+ ARM_COMPUTE_ERROR_ON(shift >= 8 && (!is_data_type_fixed_point(input->info()->data_type()) && !is_data_type_fixed_point(output->info()->data_type())));
+ ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type())));
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
&& output->info()->data_type() != DataType::S32),
"Only data_types supported [in] U8 -> [out] U16, S16, S32");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::F32),
+ "Only data_types supported [in] QS8 -> [out] QS8, F32");
+
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
"Only data_types supported [in] U16 -> [out] U8, U32");
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
"Only data_types supported [in] S16 -> [out] U8, S32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8),
- "Only data_types supported [in] F32 -> [out] QS8");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::QS16 && output->info()->data_type() != DataType::F32),
+ "Only data_types supported [in] QS16 -> [out] QS16, F32");
- _policy = policy;
- _shift = shift;
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16),
+ "Only data_types supported [in] F32 -> [out] QS8, QS16");
constexpr unsigned int num_elems_processed_per_iteration = 16;
- INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+ }
+ else
+ {
+ // In-place computation
+ update_window_and_padding(win, input_access);
+ }
+ ICPPKernel::configure(win);
}
-void NEDepthConvertKernel::run(const Window &window)
+void NEDepthConvertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(nullptr == _input);
ARM_COMPUTE_ERROR_ON(nullptr == _output);
ARM_COMPUTE_ERROR_ON(_input == _output);
@@ -86,37 +132,10 @@
Iterator input(_input, window);
Iterator output(_output, window);
+ bool in_place = (_input == _output);
+
switch(_input->info()->data_type())
{
- case DataType::QS8:
- {
- const int fixed_point_position = _input->info()->fixed_point_position();
-
- switch(_output->info()->data_type())
- {
- case DataType::F32:
- {
- /* Up-conversion QS8 -> F32 */
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<const int8_t *>(input.ptr()));
-
- float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_s8), fixed_point_position);
- float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_s8), fixed_point_position);
-
- vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
- },
- input, output);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Output data type not supported");
- }
- break;
- }
case DataType::U8:
{
const int16x8_t b = vdupq_n_s16(_shift);
@@ -193,6 +212,49 @@
}
break;
}
+ case DataType::QS8:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::QS8:
+ {
+ const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
+ /* Fixed point position conversion QS8 -> QS8 */
+ if(relative_shift != 0 || !in_place)
+ {
+ const auto relative_shift_vec = vdupq_n_qs8(relative_shift);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqrshlq_s8(texels_qs8, relative_shift_vec));
+ },
+ input, output);
+ }
+ break;
+ }
+ case DataType::F32:
+ {
+ /* Up-conversion QS8 -> F32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
+
+ float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_qs8), _fixed_point_position_input);
+ float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_qs8), _fixed_point_position_input);
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
case DataType::S16:
{
switch(_output->info()->data_type())
@@ -346,13 +408,65 @@
}
break;
}
+ case DataType::QS16:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::QS16:
+ {
+ const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
+ /* Fixed point position conversion QS16 -> QS16 */
+ if(relative_shift != 0 || !in_place)
+ {
+ const auto relative_shift_vec = vdupq_n_qs16(relative_shift);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint16x8x2_t texels_qs16 =
+ {
+ {
+ vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr())),
+ vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
+ }
+ };
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqrshlq_s16(texels_qs16.val[0], relative_shift_vec));
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqrshlq_s16(texels_qs16.val[1], relative_shift_vec));
+ },
+ input, output);
+ }
+ break;
+ }
+ case DataType::F32:
+ {
+ /* Up-conversion QS16 -> F32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t texels_qs16 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr())),
+ vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
+ }
+ };
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels_qs16.val[0]), _fixed_point_position_input));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[0]), _fixed_point_position_input));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels_qs16.val[1]), _fixed_point_position_input));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[1]), _fixed_point_position_input));
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
case DataType::F32:
{
switch(_output->info()->data_type())
{
case DataType::QS8:
{
- const int fixed_point_position = _output->info()->fixed_point_position();
/* Down-conversion F32 -> QS8 */
execute_window_loop(window, [&](const Coordinates & id)
{
@@ -366,13 +480,39 @@
}
};
- const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position);
+ const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, _fixed_point_position_output);
vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
},
input, output);
break;
}
+ case DataType::QS16:
+ {
+ /* Down-conversion F32 -> QS16 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x2_t texels_f32_1 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+ }
+ };
+ const float32x4x2_t texels_f32_2 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, _fixed_point_position_output));
+ vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, _fixed_point_position_output));
+ },
+ input, output);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Output data type not supported");
}
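
Illustrative aside (not part of the patch): the QS8 -> QS8 branch above changes the fixed point position with a saturating rounding shift (vqrshlq_s8 driven by relative_shift). A minimal scalar sketch of that per-element operation, using a hypothetical helper name requantize_qs8 and assuming arithmetic right shifts, is:

#include <algorithm>
#include <cstdint>

// Scalar sketch: shift > 0 adds fractional bits, shift < 0 removes them with
// rounding; the result is saturated to the signed 8-bit range, mirroring the
// saturating rounding behaviour of the NEON path in spirit.
int8_t requantize_qs8(int8_t in, int fixed_point_position_input, int fixed_point_position_output)
{
    const int shift = fixed_point_position_output - fixed_point_position_input;
    int32_t   v     = in;
    if(shift >= 0)
    {
        v <<= shift;
    }
    else
    {
        v = (v + (1 << (-shift - 1))) >> -shift; // rounding right shift
    }
    return static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, v)));
}
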
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
new file mode 100644
index 0000000..70984f0
--- /dev/null
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+NEDequantizationLayerKernel::NEDequantizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _min_max(nullptr)
+{
+}
+
+void NEDequantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(min_max);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32, 0);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+ _min_max = min_max;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+
+ // Update window and padding
+ update_window_and_padding(win, input_access, output_access, min_max_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEDequantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window window_input_output(window);
+ window_input_output.collapse_if_possible(INEKernel::window(), 3);
+ window_input_output.set(3, Window::Dimension(0, 1, 1));
+
+ Window window_min_max;
+ window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
+ window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_min_max.collapse_if_possible(INEKernel::window(), 1);
+
+ Iterator input(_input, window_input_output);
+ Iterator output(_output, window_input_output);
+ Iterator min_max(_min_max, window_min_max);
+
+ execute_window_loop(window_min_max, [&](const Coordinates & id_batch)
+ {
+ // Get the min and max
+ const float min = *(reinterpret_cast<const float *>(min_max.ptr()) + 0);
+ const float max = *(reinterpret_cast<const float *>(min_max.ptr()) + 1);
+
+ const float32x4_t vmin = vdupq_n_f32(min);
+ const float range = max - min;
+ const float32x4_t scaling = vdupq_n_f32(range / 255.0f);
+
+ // Uniformly map the quantized 8-bit values back to the original range, i.e. [0, 255] -> [min, max]
+ execute_window_loop(window_input_output, [&](const Coordinates & id)
+ {
+ // Get the input values
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
+
+ const uint8x8_t val_u8 = vld1_u8(input_ptr);
+ const uint16x8_t val_u16 = vmovl_u8(val_u8);
+ const uint32x4_t val_u32_low = vmovl_u16(vget_low_u16(val_u16));
+ const uint32x4_t val_u32_high = vmovl_u16(vget_high_u16(val_u16));
+ float32x4_t val_low = vcvtq_f32_u32(val_u32_low);
+ float32x4_t val_high = vcvtq_f32_u32(val_u32_high);
+
+ // Dequantize -> (q / 255.0 * range) + min
+ val_low = vmulq_f32(val_low, scaling);
+ val_high = vmulq_f32(val_high, scaling);
+ val_low = vaddq_f32(val_low, vmin);
+ val_high = vaddq_f32(val_high, vmin);
+
+ const float32x4x2_t dequantized = vuzpq_f32(val_low, val_high);
+
+ // Store the dequantized values
+ auto output_ptr = reinterpret_cast<float *>(output.ptr() + id_batch[1] * _output->info()->strides_in_bytes()[3]);
+ vst2q_f32(output_ptr, dequantized);
+ },
+ input, output);
+ },
+ min_max);
+}
\ No newline at end of file
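
Illustrative aside (not part of the patch): the vectorised loop above reduces, per element, to the affine mapping noted in the "Dequantize" comment. A minimal scalar sketch, using a hypothetical helper name dequantize_u8, is:

#include <cstdint>

// Scalar sketch: a quantized value q in [0, 255] is mapped linearly back to [min, max].
float dequantize_u8(uint8_t q, float min, float max)
{
    const float range = max - min;
    return static_cast<float>(q) * (range / 255.0f) + min;
}
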
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
index bf7e097..a5680eb 100644
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -214,8 +214,9 @@
in, out_x, out_y);
}
-void NEDerivativeKernel::run(const Window &window)
+void NEDerivativeKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp
index 867cf77..3ee00a4 100644
--- a/src/core/NEON/kernels/NEDilateKernel.cpp
+++ b/src/core/NEON/kernels/NEDilateKernel.cpp
@@ -67,8 +67,9 @@
INEKernel::configure(win);
}
-void NEDilateKernel::run(const Window &window)
+void NEDilateKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
index effc50e..6631359 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -54,6 +54,11 @@
return vld1q_qs16(in);
}
+inline qint32x4_t internal_vld1q(const qint32_t *in)
+{
+ return vld1q_s32(in);
+}
+
// Internal store
inline void internal_vst1q(float *p, const float32x4_t &v)
{
@@ -72,6 +77,16 @@
vst1q_qs16(p, v);
}
+inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
+{
+ vst1q_s32(p, v);
+}
+
+inline void internal_vst1q(qint16_t *p, const qint32x4_t &v)
+{
+ vst1_qs16(p, vqmovn_qs32(v));
+}
+
// Internal vdup
inline float32x4_t internal_vdupq_n(float v)
{
@@ -86,6 +101,11 @@
return vdupq_n_qs16(v);
}
+inline qint32x4_t internal_vdupq_n(qint32_t v)
+{
+ return vdupq_n_qs32(v);
+}
+
// Internal vadd
inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
{
@@ -99,6 +119,29 @@
{
return vqaddq_qs16(x, y);
}
+inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y)
+{
+ return vqaddq_qs32(x, y);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+inline float16x8_t internal_vld1q(const float16_t *in)
+{
+ return vld1q_f16(in);
+}
+inline void internal_vst1q(float16_t *p, const float16x8_t &v)
+{
+ vst1q_f16(p, v);
+}
+inline float16x8_t internal_vdupq_n(float16_t v)
+{
+ return vdupq_n_f16(v);
+}
+inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
+{
+ return vaddq_f16(x, y);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
template <typename T1, typename T2, bool in_place>
void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
@@ -143,8 +186,8 @@
void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position());
if(output != nullptr)
{
@@ -179,26 +222,53 @@
INEKernel::configure(win);
// Set appropriate function
- if(input->info()->data_type() == DataType::F32)
+ switch(input->info()->data_type())
{
- _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
- }
- else if(input->info()->data_type() == DataType::QS8)
- {
- _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
- }
- else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8)
- {
- _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ case DataType::QS8:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
+ break;
+ }
+ case DataType::QS16:
+ {
+ if(bias->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
+ }
+ case DataType::QS32:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint32_t, qint16_t, true> : &accumulate_bias<qint32_t, qint16_t, false>;
+ break;
+ }
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float16_t, float16_t, true> : &accumulate_bias<float16_t, float16_t, false>;
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ case DataType::F32:
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ break;
+ }
}
}
-void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window)
+void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index d608898..c8e1113 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include <algorithm>
@@ -40,6 +41,81 @@
namespace
{
template <unsigned int stridex>
+qint16x8_t internal_vld1q(const qint16_t *in);
+
+template <>
+qint16x8_t internal_vld1q<1>(const qint16_t *in)
+{
+ return vld1q_qs16(in);
+}
+
+template <>
+qint16x8_t internal_vld1q<2>(const qint16_t *in)
+{
+ const int16x8x2_t tmp = vld2q_s16(in);
+ return tmp.val[0];
+}
+
+template <>
+qint16x8_t internal_vld1q<3>(const qint16_t *in)
+{
+ const int16x8x3_t tmp = vld3q_s16(in);
+ return tmp.val[0];
+}
+
+inline qint16x8_t internal_vdupq_n(qint16_t v)
+{
+ return vdupq_n_qs16(v);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+template <unsigned int stridex>
+float16x8_t internal_vld1q(const float16_t *in);
+
+template <>
+float16x8_t internal_vld1q<1>(const float16_t *in)
+{
+ return vld1q_f16(in);
+}
+
+template <>
+float16x8_t internal_vld1q<2>(const float16_t *in)
+{
+ const float16x8x2_t tmp = vld2q_f16(in);
+ return tmp.val[0];
+}
+
+template <>
+float16x8_t internal_vld1q<3>(const float16_t *in)
+{
+ const float16x8x3_t tmp = vld3q_f16(in);
+ return tmp.val[0];
+}
+
+inline float16x8_t internal_vdupq_n(float16_t v)
+{
+ return vdupq_n_f16(v);
+}
+
+inline void internal_vst1q(float16_t *p, const float16x8_t &v)
+{
+ vst1q_f16(p, v);
+}
+
+float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmulq_f16(x, y);
+}
+
+inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vaddq_f16(x, vmulq_f16(y, z));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+template <unsigned int stridex>
float32x4_t internal_vld1q(const float *in);
template <>
@@ -62,6 +138,28 @@
return tmp.val[0];
}
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+
+float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmulq_f32(x, y);
+}
+
+inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmlaq_f32(x, y, z);
+}
+
template <unsigned int stridex>
qint8x8_t internal_vld1q(const qint8_t *in);
@@ -85,28 +183,19 @@
return tmp.val[0];
}
-template <unsigned int stridex>
-qint16x8_t internal_vld1q(const qint16_t *in);
-
-template <>
-qint16x8_t internal_vld1q<1>(const qint16_t *in)
-{
- return vld1q_s16(in);
-}
-
-inline float32x4_t internal_vdupq_n(float v)
-{
- return vdupq_n_f32(v);
-}
-
inline qint8x8_t internal_vdupq_n(qint8_t v)
{
return vdup_n_qs8(v);
}
-inline void internal_vst1q(float *p, const float32x4_t &v)
+inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
{
- vst1q_f32(p, v);
+ return vmull_qs8(x, y, fixed_point_position);
+}
+
+inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
+{
+ return vqmlal_qs8(x, y, z, fixed_point_position);
}
inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
@@ -114,28 +203,140 @@
vst1q_qs16(p, v);
}
-float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+inline void internal_vst1q(int *p, const qint32x4x2_t &v)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- return vmulq_f32(x, y);
+ vst1q_s32(p, v.val[0]);
+ vst1q_s32(p + 4, v.val[1]);
}
-qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
+template <unsigned int stridex>
+qint32x4x2_t internal_vld1q(const qint32_t *in);
+
+template <>
+qint32x4x2_t internal_vld1q<1>(const qint32_t *in)
{
- return vmull_qs8(x, y, fixed_point_position);
+ const qint32x4x2_t r =
+ {
+ {
+ vld1q_s32(in),
+ vld1q_s32(in + 4)
+ }
+ };
+ return r;
}
-inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- return vmlaq_f32(x, y, z);
+ const qint32x4x2_t r =
+ {
+ {
+ vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position),
+ vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position),
+ }
+ };
+ return r;
}
-inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
+inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position)
{
- return vqmlal_qs8(x, y, z, fixed_point_position);
+ const qint32x4x2_t r =
+ {
+ {
+ vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position),
+ vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position)
+ }
+ };
+ return r;
}
+constexpr int SmallTensorSizeOptim = 8;
+inline bool run_optim_small_tensor(const ITensor *t)
+{
+ return t->info()->dimension(Window::DimX) <= SmallTensorSizeOptim && t->info()->dimension(Window::DimY) <= SmallTensorSizeOptim;
+}
+
+// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8
+// For big Z as in Input=7x7x832, this implementation is faster than the general code because it doesn't need to
+// store intermediate results in memory. Temporary results are stored in NEON registers directly and then written to the output buffer.
+template <unsigned int stridex>
+class convolver_w1x1_i8x8_f32
+{
+public:
+ static void convolve(const Window &window, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > SmallTensorSizeOptim);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > SmallTensorSizeOptim);
+
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_h = output->info()->dimension(1);
+ const int range_z = window.z().end() - window.z().start();
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+
+ // setup output window for the iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
+
+ // setup input window for the iterator
+ Window window_in = window;
+ // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+ Iterator out(output, window_out);
+ Iterator in(input, window_in);
+ Iterator k(weights, window_k);
+
+ const uint8_t *k_ptr = k.ptr();
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const uint8_t *input_ptr = in.ptr();
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+ float32x4_t accum0[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
+ float32x4_t accum1[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
+ for(int oz = 0; oz < range_z; ++oz)
+ {
+ accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
+ accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
+ auto p_out_base = out_ptr + oz * output_stride_z;
+ for(int p = 0; p < kernel_depth; ++p)
+ {
+ const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
+ const auto vk0 = internal_vdupq_n(*k_val);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ const int offset_xy = ih * input_stride_y;
+ auto in_val = reinterpret_cast<const float *>(input_ptr + p * input_stride_z + offset_xy);
+ auto v_in0 = internal_vld1q<stridex>(in_val);
+ auto v_in1 = internal_vld1q<stridex>(in_val + 4);
+ accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
+ accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
+ }
+ }
+ for(oh = 0; oh < output_h; ++oh)
+ {
+ auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y);
+ vst1q_f32(p_out, accum0[oh]);
+ vst1q_f32(p_out + 4, accum1[oh]);
+ }
+ }
+ },
+ in, out);
+ }
+};
+
template <typename T1, typename T2, unsigned int stridex>
class convolver_1x1
{
@@ -169,8 +370,7 @@
window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
Iterator out(output, window_out);
Iterator in(input, window_in);
Iterator k(weights, window_k);
@@ -204,6 +404,7 @@
}
}
}
+
// Step 2
for(int p = 1; p < kernel_depth; ++p)
{
@@ -226,6 +427,148 @@
}
};
+#ifdef ARM_COMPUTE_ENABLE_FP16
+inline float16x8x3_t load_matrix_row(const float16_t *ptr)
+{
+ /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
+ r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+ const float16x8x3_t r =
+ {
+ {
+ vld1q_dup_f16(ptr),
+ vld1q_dup_f16(1 + ptr),
+ vld1q_dup_f16(2 + ptr)
+ }
+ };
+ return r;
+}
+
+template <unsigned int stridex>
+float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position);
+
+template <>
+float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const float16x8x3_t vtop =
+ {
+ {
+ vld1q_f16(in_top),
+ vld1q_f16(in_top + 8),
+ vld1q_f16(in_top + 16)
+ }
+ };
+ const float16x8x3_t vmid =
+ {
+ {
+ vld1q_f16(in_mid),
+ vld1q_f16(in_mid + 8),
+ vld1q_f16(in_mid + 16)
+ }
+ };
+ const float16x8x3_t vlow =
+ {
+ {
+ vld1q_f16(in_low),
+ vld1q_f16(in_low + 8),
+ vld1q_f16(in_low + 16)
+ }
+ };
+ float16x8x2_t out =
+ {
+ {
+ vmulq_f16(vtop.val[0], m0.val[0]),
+ vmulq_f16(vtop.val[1], m0.val[0])
+ }
+ };
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
+ return out;
+}
+
+template <>
+inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+template <unsigned int stridex>
+void store_results(float16_t *buffer, const float16x8x2_t &values);
+
+template <>
+void store_results<1>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, values.val[0]);
+ vst1q_f16(buffer + 8, values.val[1]);
+}
+
+template <>
+void store_results<2>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1_f16(buffer, vget_low_f16(values.val[0]));
+}
+
+template <unsigned int stridex>
+void accumulate_results(float16_t *buffer, const float16x8x2_t &values);
+
+template <>
+void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
+ vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0])));
+}
+
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
inline float32x4x3_t load_matrix_row(const float *ptr)
{
const float32x4x3_t r =
@@ -254,6 +597,159 @@
}
template <unsigned int stridex>
+float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position);
+
+inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
+{
+ const float32x4x3_t m00 =
+ {
+ {
+ vld1q_dup_f32(m0),
+ vld1q_dup_f32(m1),
+ vld1q_dup_f32(m2)
+ }
+ };
+ return m00;
+}
+
+inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4)
+{
+ const float32x4x2_t m00 =
+ {
+ {
+ vld1q_dup_f32(m3),
+ vld1q_dup_f32(m4)
+ }
+ };
+ return m00;
+}
+
+inline float32x4x3_t load_input(const float *const in)
+{
+ const float32x4x3_t vin =
+ {
+ {
+ vld1q_f32(in),
+ vld1q_f32(in + 4),
+ vld1q_f32(in + 8)
+ }
+ };
+ return vin;
+}
+
+template <>
+inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ const float32x4x3_t vin0 = load_input(in_0);
+ const float32x4x3_t vin1 = load_input(in_1);
+ const float32x4x3_t vin2 = load_input(in_2);
+ const float32x4x3_t vin3 = load_input(in_3);
+ const float32x4x3_t vin4 = load_input(in_4);
+ const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0);
+ const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0);
+ const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1);
+ const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1);
+ const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2);
+ const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2);
+ const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3);
+ const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3);
+ const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4);
+ const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4);
+
+ float32x4x2_t out =
+ {
+ {
+ vmulq_f32(vin0.val[0], m00.val[0]),
+ vmulq_f32(vin0.val[1], m00.val[0])
+ }
+ };
+
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);
+
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+{
+ float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+template <unsigned int stridex>
float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
template <>
@@ -294,17 +790,22 @@
};
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+
out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
+
out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
+
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
+
out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
+
out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
@@ -587,15 +1088,15 @@
1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1.
*/
-
for(int oz = 0; oz < num_planes_z; ++oz)
{
+ const int zoffset = id.z() + oz;
uint8_t *p_out_base = out_ptr + oz * output_stride_z;
// Step 1
{
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
const auto vk_r0 = load_matrix_row(ptr_k_r0);
const auto vk_r1 = load_matrix_row(ptr_k_r1);
const auto vk_r2 = load_matrix_row(ptr_k_r2);
@@ -616,17 +1117,19 @@
// Step 2
for(int p = 1; p < kernel_depth; ++p)
{
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto vk_r0 = load_matrix_row(ptr_k_r0);
- const auto vk_r1 = load_matrix_row(ptr_k_r1);
- const auto vk_r2 = load_matrix_row(ptr_k_r2);
+ const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
+ const uint8_t *input_base = input_ptr + p * input_stride_z;
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);
+ const auto vk_r0 = load_matrix_row(ptr_k_r0);
+ const auto vk_r1 = load_matrix_row(ptr_k_r1);
+ const auto vk_r2 = load_matrix_row(ptr_k_r2);
for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
{
- auto in_top = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
- auto in_mid = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
- auto in_low = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
+ auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y);
+ auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y);
+ auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y);
auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
@@ -642,6 +1145,118 @@
}
};
+template <typename T1, typename T2, unsigned int stridex>
+class convolver_5x5
+{
+public:
+ static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
+ const int input_stride_x = input->info()->strides_in_bytes().x();
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_x = weights->info()->strides_in_bytes().x();
+ const int kernel_stride_y = weights->info()->strides_in_bytes().y();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int num_planes_z = window.z().end() - window.z().start();
+ const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
+ const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // setup output window for the iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
+
+ // setup input window for the iterator
+ Window window_in = window;
+ // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+ Iterator out(output, window_out);
+ Iterator in(input, window_in);
+ Iterator k(weights, window_k);
+
+ const uint8_t *k_ptr = k.ptr();
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+ for(int oz = 0; oz < num_planes_z; ++oz)
+ {
+ const int zoffset = id.z() + oz;
+ uint8_t *p_out_base = out_ptr + oz * output_stride_z;
+ // Step 1
+ {
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ auto in_0 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
+ auto in_1 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
+ auto in_2 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
+ auto in_3 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);
+ auto in_4 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+ in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
+ {
+ auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
+ store_results<stridex>(p_out, vres);
+ }
+ }
+ }
+ // Step 2
+ for(int p = 1; p < kernel_depth; ++p)
+ {
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
+
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ auto in_0 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
+ auto in_1 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
+ auto in_2 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
+ auto in_3 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);
+ auto in_4 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+ in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
+ {
+ auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
+ accumulate_results<stridex>(p_out, vres);
+ }
+ }
+ }
+ }
+ },
+ in, out);
+ }
+};
+
template <typename T1, typename T2>
inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
@@ -663,6 +1278,47 @@
}
}
+template <>
+inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ if(run_optim_small_tensor(input))
+ {
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_w1x1_i8x8_f32<1>::convolve(window, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_w1x1_i8x8_f32<2>::convolve(window, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_w1x1_i8x8_f32<3>::convolve(window, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ }
+ else
+ {
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ }
+}
+
template <typename T1, typename T2>
inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
@@ -683,10 +1339,33 @@
ARM_COMPUTE_ERROR("Not implemented");
}
}
+
+template <typename T1, typename T2>
+inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
} // namespace
NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
- : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_elems_read_per_iteration(0), _num_elems_written_per_iteration(0)
+ : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0),
+ _num_elems_written_per_iteration(0)
{
}
@@ -697,14 +1376,19 @@
void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
"Pad > 0 not supported for 1x1 weights");
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
"Pad > 1 not supported for 3x3 weights");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2),
+ "Pad > 2 not supported for 5x5 weights");
+
ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
@@ -717,53 +1401,88 @@
_kernel_size = weights->info()->dimension(0);
_border_size = BorderSize(conv_pad_y, conv_pad_x);
- Window win = calculate_max_window(*output->info());
+ const unsigned int kernel_size = weights->info()->dimension(0);
+
+ // Get convolved dimensions
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+ std::tie(output_width, output_height) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, output_width);
+ output_shape.set(1, output_height);
+ output_shape.set(2, weights->info()->dimension(3));
+
+ DataType data_type = input->info()->data_type();
+
+ if(is_data_type_fixed_point(data_type))
+ {
+ // Promote data type in case of fixed point
+ data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
+ }
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, output->info()->data_type());
switch(_kernel_size)
{
case 1:
{
- _num_elems_written_per_iteration = (input->info()->data_type() == DataType::QS8) ? 8 : 4;
- _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration;
-
- win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, _num_elems_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ switch(input->info()->data_type())
+ {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ case DataType::QS8:
+ case DataType::QS16:
+ _num_elems_written_per_iteration = 8;
+ break;
+ case DataType::F32:
+ if(run_optim_small_tensor(input))
+ {
+ _num_elems_written_per_iteration = 8;
+ }
+ else
+ {
+ _num_elems_written_per_iteration = 4;
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ break;
+ }
+ _num_weight_elems_read_per_row = kernel_size;
+ _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration;
break;
}
case 3:
+ case 5:
{
- if(input->info()->data_type() == DataType::F32)
+ switch(input->info()->data_type())
{
- _num_elems_read_per_iteration = 12;
- _num_elems_written_per_iteration = 16 >> conv_stride_x;
+ case DataType::F32:
+ _num_weight_elems_read_per_row = 4 + _kernel_size - 1;
+ _num_elems_read_per_iteration = 12;
+ _num_elems_written_per_iteration = 16 >> conv_stride_x;
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ case DataType::QS8:
+ case DataType::QS16:
+ _num_weight_elems_read_per_row = 8 + _kernel_size - 1;
+ _num_elems_read_per_iteration = 24;
+ _num_elems_written_per_iteration = 32 >> conv_stride_x;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ break;
}
- else
- {
- _num_elems_read_per_iteration = 24;
- _num_elems_written_per_iteration = 32 >> conv_stride_x;
- }
-
- // Calculate right and bottom border
- const unsigned int conv_stride_y = std::get<1>(_conv_info.stride());
- const int input_width = input->info()->dimension(0);
- const int input_height = input->info()->dimension(1);
- const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width;
- const int upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height;
- _border_size.right = std::max(upper_bound_w, static_cast<int>(_kernel_size));
- _border_size.bottom = std::max(upper_bound_h, static_cast<int>(_kernel_size));
-
- // Create window and update padding
- win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
- AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
- AccessWindowStatic weights_access(weights->info(), 0, 0, _kernel_size, _kernel_size);
- AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
- update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
- break;
}
+ break;
default:
{
ARM_COMPUTE_ERROR("Not implemented");
@@ -771,11 +1490,27 @@
}
}
+ // Calculate right and bottom border
+ const unsigned int conv_stride_y = std::get<1>(_conv_info.stride());
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
+ const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width;
+ const int upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height;
+ _border_size.right = std::max(upper_bound_w, static_cast<int>(_kernel_size));
+ _border_size.bottom = std::max(upper_bound_h, static_cast<int>(_kernel_size));
+ Window win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+ AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, _num_weight_elems_read_per_row, _kernel_size);
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
+ update_window_and_padding(win, input_access, weights_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
INEKernel::configure(win);
}
-void NEDirectConvolutionLayerKernel::run(const Window &window)
+void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
@@ -786,31 +1521,66 @@
{
case 1:
{
- if(_input->info()->data_type() == DataType::QS8)
+ switch(_input->info()->data_type())
{
- convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- }
- else
- {
- convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ case DataType::QS8:
+ convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::QS16:
+ convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::F32:
+ convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
}
break;
}
case 3:
{
- if(_input->info()->data_type() == DataType::QS8)
+ switch(_input->info()->data_type())
{
- convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- }
- else
- {
- convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ case DataType::QS8:
+ convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::F32:
+ convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
}
break;
}
+ case 5:
+ {
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ break;
+ }
+
default:
{
- ARM_COMPUTE_ERROR("Only kernel sizes 1x1 and 3x3 are supported.");
+ ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
break;
}
}
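
For reference, the output shape computed in configure() above follows the usual direct-convolution size formula; a minimal standalone sketch of that arithmetic (the floor rounding policy of scaled_dimensions() is an assumption here, and the helper name is illustrative):

#include <cstdio>

// Illustrative sketch: output extent of a direct convolution along one axis,
// i.e. floor((in + 2 * pad - kernel) / stride) + 1.
static unsigned int convolved_extent(unsigned int in, unsigned int kernel,
                                     unsigned int stride, unsigned int pad)
{
    return (in + 2 * pad - kernel) / stride + 1;
}

int main()
{
    // e.g. 64x64 input, 5x5 weights, stride 2, pad 2 -> 32x32 output
    const unsigned int out_w = convolved_extent(64, 5, 2, 2);
    const unsigned int out_h = convolved_extent(64, 5, 2, 2);
    std::printf("%ux%u\n", out_w, out_h);
    return 0;
}
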
diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp
index 3985036..88c20f8 100644
--- a/src/core/NEON/kernels/NEErodeKernel.cpp
+++ b/src/core/NEON/kernels/NEErodeKernel.cpp
@@ -67,8 +67,9 @@
INEKernel::configure(win);
}
-void NEErodeKernel::run(const Window &window)
+void NEErodeKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp
index 9e8b552..919efd2 100644
--- a/src/core/NEON/kernels/NEFastCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp
@@ -388,8 +388,9 @@
INEKernel::configure(win);
}
-void NEFastCornersKernel::run(const Window &window)
+void NEFastCornersKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEFillArrayKernel.cpp b/src/core/NEON/kernels/NEFillArrayKernel.cpp
index 7e7e1c2..5a2e1a0 100644
--- a/src/core/NEON/kernels/NEFillArrayKernel.cpp
+++ b/src/core/NEON/kernels/NEFillArrayKernel.cpp
@@ -62,8 +62,9 @@
return false;
}
-void NEFillArrayKernel::run(const Window &window)
+void NEFillArrayKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index bd99242..9505a25 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
@@ -35,6 +36,63 @@
using namespace arm_compute;
+namespace
+{
+template <typename T, unsigned int leftx, unsigned int rightx>
+void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value);
+
+template <>
+inline void fill_constant_value_single_channel_special<float, 1u, 1u>(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
+{
+ float border_value;
+ constant_border_value.get(border_value);
+ uint8_t *const start_valid_region = tensor->ptr_to_element(tensor->info()->valid_region().anchor);
+ const size_t &width = tensor->info()->valid_region().shape[0];
+ const size_t &height = tensor->info()->valid_region().shape[1];
+ const int stridey = tensor->info()->strides_in_bytes()[1];
+
+ // Left and right border
+ Window vertical(window);
+ vertical.set(Window::DimY, Window::Dimension(0, height, 1));
+
+ Iterator vertical_it(tensor, vertical);
+
+ execute_window_loop(vertical, [&](const Coordinates &)
+ {
+ const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
+
+ // Fill left and right borders
+ *(row_start - 1) = border_value;
+ std::fill_n(row_start + width, right, border_value);
+ },
+ vertical_it);
+
+ // Top and bottom border
+ Iterator plane_it(tensor, window);
+
+ // Iterate over all XY planes
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
+ // Fill top rows including left/right borders
+ std::fill_n(row_start - 1, 1 + width + right, border_value);
+
+ // Bottom border
+ const unsigned low_border_size = height + bottom;
+ for(unsigned int i = height; i < low_border_size; ++i)
+ {
+ const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
+
+ // Fill bottom rows including left/right borders
+ std::fill_n(row_start - 1, 1 + width + right, border_value);
+ }
+ },
+ plane_it);
+}
+} // namespace
+
namespace arm_compute
{
class Coordinates;
@@ -47,7 +105,7 @@
void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
_tensor = tensor;
_border_size = border_size;
@@ -59,12 +117,14 @@
Window win;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(Window::DimY, Window::Dimension(0, 1, 1));
- win.use_tensor_dimensions(_tensor->info(), Window::DimZ);
+ win.use_tensor_dimensions(_tensor->info()->tensor_shape(), Window::DimZ);
INEKernel::configure(win);
}
-void NEFillBorderKernel::run(const Window &window)
+void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
+
// If there is no border: early exit
if(_border_size.empty())
{
@@ -100,9 +160,20 @@
case DataType::S32:
fill_constant_value_single_channel<int32_t>(window);
break;
+ case DataType::F16:
+ static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
+ fill_constant_value_single_channel<half>(window);
+ break;
case DataType::F32:
static_assert(sizeof(float) == 4, "Float must be 32 bit");
- fill_constant_value_single_channel<float>(window);
+ if(_border_size.left == 1 && _border_size.top == 1)
+ {
+ fill_constant_value_single_channel_special<float, 1u, 1u>(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+ }
+ else
+ {
+ fill_constant_value_single_channel<float>(window);
+ }
break;
default:
ARM_COMPUTE_ERROR("Not handled");
@@ -133,6 +204,10 @@
case DataType::S32:
fill_replicate_single_channel<int32_t>(window);
break;
+ case DataType::F16:
+ static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
+ fill_replicate_single_channel<half>(window);
+ break;
case DataType::F32:
static_assert(sizeof(float) == 4, "Float must be 32 bit");
fill_replicate_single_channel<float>(window);
@@ -214,6 +289,7 @@
uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
const size_t &width = _tensor->info()->valid_region().shape[0];
const size_t &height = _tensor->info()->valid_region().shape[1];
+ const int stridey = _tensor->info()->strides_in_bytes()[1];
// Left and right border
Window vertical(window);
@@ -237,19 +313,21 @@
// Iterate over all XY planes
execute_window_loop(window, [&](const Coordinates & id)
{
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
// Top border
for(int i = -_border_size.top; i < 0; ++i)
{
- const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+ const auto row_start = reinterpret_cast<T *>(base_addr + i * stridey);
// Fill top rows including left/right borders
std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
}
// Bottom border
- for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+ const unsigned low_border_size = height + _border_size.bottom;
+ for(unsigned int i = height; i < low_border_size; ++i)
{
- const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+ const auto row_start = reinterpret_cast<T *>(base_addr + i * stridey);
// Fill bottom rows including left/right borders
std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
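
The constant-border paths above (both the generic template and the new left = top = 1 specialisation) fill whole padded rows for the top/bottom borders and short runs per row for the left/right borders. A scalar sketch of the same pattern on a plain row-major buffer, with the tensor strides simplified away (illustrative; the function name is not the library's):

#include <algorithm>
#include <vector>

// Fill a constant border around a W x H valid region stored inside a padded
// row-major buffer of width (left + W + right) and height (top + H + bottom).
void fill_constant_border(std::vector<float> &buf, int W, int H,
                          int left, int right, int top, int bottom, float value)
{
    const int stride = left + W + right;

    // Top and bottom borders: fill whole padded rows.
    for(int y = 0; y < top; ++y)
    {
        std::fill_n(buf.begin() + y * stride, stride, value);
    }
    for(int y = top + H; y < top + H + bottom; ++y)
    {
        std::fill_n(buf.begin() + y * stride, stride, value);
    }

    // Left and right borders: short runs at both ends of each valid row.
    for(int y = top; y < top + H; ++y)
    {
        std::fill_n(buf.begin() + y * stride, left, value);
        std::fill_n(buf.begin() + y * stride + left + W, right, value);
    }
}
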
diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
index 699a5d9..017e259 100644
--- a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
@@ -57,12 +57,13 @@
Window win;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(Window::DimY, Window::Dimension(0, 1, 1));
- win.use_tensor_dimensions(_tensor->info(), Window::DimZ);
+ win.use_tensor_dimensions(_tensor->info()->tensor_shape(), Window::DimZ);
INEKernel::configure(win);
}
-void NEFillInnerBorderKernel::run(const Window &window)
+void NEFillInnerBorderKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
new file mode 100644
index 0000000..dd85ac1
--- /dev/null
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+void NEFloorKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_data_type_if_unknown(*input->info(), DataType::F32);
+ set_data_type_if_unknown(*output->info(), DataType::F32);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEFloorKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4_t res = vfloorq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
+ },
+ input, output);
+}
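
NEFloorKernel applies floor() element-wise, four F32 lanes per iteration, and relies on the access-window padding so no scalar tail loop is required. A scalar reference of the operation it vectorises (illustrative):

#include <cmath>
#include <cstddef>

// Scalar reference: out[i] = floor(in[i]) over a contiguous F32 buffer.
void floor_reference(const float *in, float *out, std::size_t count)
{
    for(std::size_t i = 0; i < count; ++i)
    {
        out[i] = std::floor(in[i]);
    }
}
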
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 3ff8b7b..ae5d456 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -85,10 +85,10 @@
const uint16x4x4_t data =
{
{
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 0 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 1 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 2 * in_stride)),
- vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 3 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 0 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 1 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 2 * in_stride)),
+ vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 3 * in_stride)),
}
};
vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data);
@@ -113,10 +113,10 @@
const uint32x4x4_t data =
{
{
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 0 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 1 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 2 * in_stride)),
- vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 3 * in_stride))
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 0 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 1 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 2 * in_stride)),
+ vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 3 * in_stride))
}
};
vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data);
@@ -132,11 +132,20 @@
void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, input->info()->dimension(0) * 4);
+ output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(1) / 4.0f));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
@@ -173,8 +182,9 @@
INEKernel::configure(win);
}
-void NEGEMMInterleave4x4Kernel::run(const Window &window)
+void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
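
The auto-initialised output shape above encodes the 4x4 interleave layout: width becomes width * 4 and height becomes ceil(height / 4), because each output row packs four consecutive input rows column by column. A scalar sketch of that interleave for a height that is a multiple of 4 (illustrative; no tail or padding handling):

#include <vector>

// Interleave rows in groups of 4: output row r holds
// in(4r, 0), in(4r+1, 0), in(4r+2, 0), in(4r+3, 0), in(4r, 1), ...
// so the output is (width * 4) wide and (height / 4) tall.
std::vector<float> interleave_4x4(const std::vector<float> &in, int width, int height)
{
    std::vector<float> out(in.size());
    for(int y = 0; y < height; y += 4)
    {
        float *dst = &out[(y / 4) * width * 4];
        for(int x = 0; x < width; ++x)
        {
            for(int r = 0; r < 4; ++r)
            {
                *dst++ = in[(y + r) * width + x];
            }
        }
    }
    return out;
}
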
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 3558c68..cbba446 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -81,8 +81,9 @@
INEKernel::configure(win);
}
-void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window)
+void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index 7a3bae5..fb07cb0 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -45,10 +45,10 @@
void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(biases, accum);
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
_biases = biases;
_accum = accum;
@@ -58,11 +58,9 @@
// Configure kernel window
Window win = calculate_max_window(*accum->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
-
update_window_and_padding(win,
AccessWindowHorizontal(accum->info(), 0, num_elems_processed_per_iteration),
- biases_access);
+ AccessWindowStatic(biases->info(), 0, 0, win.x().end(), biases->info()->tensor_shape().y()));
AccessWindowHorizontal output_access(accum->info(), 0, num_elems_processed_per_iteration);
@@ -74,8 +72,9 @@
INEKernel::configure(win);
}
-void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window)
+void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -109,6 +108,27 @@
in0_out, in1);
break;
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr()));
+ const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr()));
+ const float16x8x2_t res =
+ {
+ {
+ vaddq_f16(accum.val[0], biases.val[0]),
+ vaddq_f16(accum.val[1], biases.val[1])
+ }
+ };
+
+ vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::QS8:
{
execute_window_loop(window, [&](const Coordinates & id)
@@ -121,6 +141,21 @@
in0_out, in1);
break;
}
+ case DataType::QS16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ qint16x8x2_t accum = vld2q_s16(reinterpret_cast<const qint16_t *>(in0_out.ptr()));
+ const qint16x8x2_t biases = vld2q_s16(reinterpret_cast<const qint16_t *>(in1.ptr()));
+
+ accum.val[0] = vqaddq_qs16(accum.val[0], biases.val[0]);
+ accum.val[1] = vqaddq_qs16(accum.val[1], biases.val[1]);
+
+ vst2q_s16(reinterpret_cast<qint16_t *>(in0_out.ptr()), accum);
+ },
+ in0_out, in1);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
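
All branches of this kernel do the same thing at different element widths: broadcast a 1-D bias vector along the rows of the accumulator. A scalar reference (illustrative):

#include <cstddef>

// Scalar reference: accum(x, y) += biases(x) for every row y.
void accumulate_biases(float *accum, const float *biases,
                       std::size_t width, std::size_t height)
{
    for(std::size_t y = 0; y < height; ++y)
    {
        for(std::size_t x = 0; x < width; ++x)
        {
            accum[y * width + x] += biases[x];
        }
    }
}
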
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 71dd4c7..9dbce1d 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -52,25 +52,8 @@
const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
const auto out_ptr = reinterpret_cast<float *>(out.ptr());
- float32x4x4_t alpha_ab =
- {
- {
- vld1q_f32(out_ptr + 0),
- vld1q_f32(out_ptr + 4),
- vld1q_f32(out_ptr + 8),
- vld1q_f32(out_ptr + 12)
- }
- };
-
- const float32x4x4_t c =
- {
- {
- vld1q_f32(in_ptr + 0),
- vld1q_f32(in_ptr + 4),
- vld1q_f32(in_ptr + 8),
- vld1q_f32(in_ptr + 12)
- }
- };
+ float32x4x4_t alpha_ab = vld4q_f32(out_ptr);
+ const float32x4x4_t c = vld4q_f32(in_ptr);
// Multiply matrix C by its weight and accumulate
alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
@@ -78,10 +61,7 @@
alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
- vst1q_f32(out_ptr + 0, alpha_ab.val[0]);
- vst1q_f32(out_ptr + 4, alpha_ab.val[1]);
- vst1q_f32(out_ptr + 8, alpha_ab.val[2]);
- vst1q_f32(out_ptr + 12, alpha_ab.val[3]);
+ vst4q_f32(out_ptr, alpha_ab);
},
in, out);
}
@@ -99,37 +79,22 @@
const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
- float16x8x2_t alpha_ab =
- {
- {
- vld1q_f16(out_ptr + 0),
- vld1q_f16(out_ptr + 8)
- }
- };
-
- float16x8x2_t c =
- {
- {
- vld1q_f16(in_ptr + 0),
- vld1q_f16(in_ptr + 8)
- }
- };
-
+ float16x8x2_t alpha_ab = vld2q_f16(out_ptr);
+ const float16x8x2_t c = vld2q_f16(in_ptr);
// Multiply matrix C by its weight and accumulate
alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
- vst1q_f16(out_ptr + 0, alpha_ab.val[0]);
- vst1q_f16(out_ptr + 8, alpha_ab.val[1]);
+ vst2q_f16(out_ptr + 0, alpha_ab);
},
in, out);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
{
const int fixed_point_position = input->info()->fixed_point_position();
- const qint8x16_t beta_qs8 = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position));
+ const qint8x16_t beta_qs8 = vdupq_n_qs8(sqcvt_qs8_f32(beta, fixed_point_position));
Iterator in(input, window);
Iterator out(output, window);
@@ -149,6 +114,31 @@
},
in, out);
}
+
+void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+ const int fixed_point_position = input->info()->fixed_point_position();
+ const qint16x8_t beta_qs16 = vdupq_n_qs16(sqcvt_qs16_f32(beta, fixed_point_position));
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<qint16_t *>(out.ptr());
+
+ qint16x8x2_t alpha_ab = vld2q_s16(out_ptr);
+ const qint16x8x2_t c = vld2q_s16(in_ptr);
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vqmlaq_qs16(alpha_ab.val[0], c.val[0], beta_qs16, fixed_point_position);
+ alpha_ab.val[1] = vqmlaq_qs16(alpha_ab.val[1], c.val[1], beta_qs16, fixed_point_position);
+
+ vst2q_s16(out_ptr, alpha_ab);
+ },
+ in, out);
+}
} // namespace
NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
@@ -158,8 +148,8 @@
void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
@@ -173,11 +163,14 @@
case DataType::QS8:
_func = &matrix_addition_qs8;
break;
+ case DataType::QS16:
+ _func = &matrix_addition_qs16;
+ break;
case DataType::F16:
#ifdef ARM_COMPUTE_ENABLE_FP16
_func = &matrix_addition_f16;
break;
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
@@ -190,8 +183,9 @@
_beta = beta;
}
-void NEGEMMMatrixAdditionKernel::run(const Window &window)
+void NEGEMMMatrixAdditionKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
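
Every matrix_addition_* variant computes out += beta * in element-wise (out already holds alpha * A * B). Because the operation is purely element-wise, the vld4q/vst4q de-interleave and re-interleave pairs used above leave the same result in memory as contiguous loads and stores. A scalar reference (illustrative):

#include <cstddef>

// Scalar reference: out[i] = out[i] + beta * in[i].
void matrix_addition_reference(const float *in, float *out, std::size_t count, float beta)
{
    for(std::size_t i = 0; i < count; ++i)
    {
        out[i] += beta * in[i];
    }
}
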
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index dcfbb13..6909082 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -50,15 +51,162 @@
namespace
{
template <bool multiply_alpha>
-void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+
+ // The implementation computes 32 elements per iteration
+ const int window_start_x = 32 * info.thread_id;
+ const int window_step_x = 32 * info.num_threads;
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+ ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, win_out);
+
+ const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
+ ARM_COMPUTE_UNUSED(alpha_f16);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ float16x8_t acc0 = vdupq_n_f16(0.f);
+ float16x8_t acc1 = vdupq_n_f16(0.f);
+ float16x8_t acc2 = vdupq_n_f16(0.f);
+ float16x8_t acc3 = vdupq_n_f16(0.f);
+
+ auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr());
+
+ const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+ for(; vec_a <= (vec_a_end_addr - 4);)
+ {
+ const float16x4_t a0l = vld1_f16(vec_a);
+
+ float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
+
+ matrix_b += 2 * in_b_stride;
+
+ b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
+
+ vec_a += 4;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const float16_t a0 = *vec_a;
+ const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
+ acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
+ acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
+ acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if(multiply_alpha)
+ {
+ acc0 = vmulq_f16(acc0, alpha_f16);
+ acc1 = vmulq_f16(acc1, alpha_f16);
+ acc2 = vmulq_f16(acc2, alpha_f16);
+ acc3 = vmulq_f16(acc3, alpha_f16);
+ }
+
+ const auto vec_out = reinterpret_cast<float16_t *>(out.ptr());
+
+ vst1q_f16(vec_out + 0, acc0);
+ vst1q_f16(vec_out + 8, acc1);
+ vst1q_f16(vec_out + 16, acc2);
+ vst1q_f16(vec_out + 24, acc3);
+
+ },
+ ina, inb, out);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(input0);
+ ARM_COMPUTE_UNUSED(input1);
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_ERROR("Not implemented");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
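
The vector-matrix kernels distribute the columns of matrix B across threads with an interleaved stride: each thread starts at elements_per_iteration * thread_id and advances by elements_per_iteration * num_threads, with the window end rounded up so every thread runs the same number of iterations (out-of-range iterations return early inside the loop body). A standalone sketch of that partitioning, assuming ceil_to_multiple() rounds a non-negative value up to the next multiple (names are illustrative):

#include <cstdio>

// Round a non-negative value up to the next multiple of divisor (divisor > 0).
static int ceil_to_multiple_of(int value, int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

int main()
{
    const int width_matrix_b = 100; // columns of matrix B / the output
    const int per_iteration  = 32;  // elements computed per loop iteration
    const int num_threads    = 4;

    for(int thread_id = 0; thread_id < num_threads; ++thread_id)
    {
        const int start = per_iteration * thread_id;
        const int step  = per_iteration * num_threads;
        const int end   = ceil_to_multiple_of(width_matrix_b - start, step) + start;

        // This thread visits x = start, start + step, ...; iterations whose
        // x falls past width_matrix_b are skipped inside the kernel loop.
        std::printf("thread %d: x = %d, %d, ... (end %d, step %d)\n",
                    thread_id, start, start + step, end, step);
    }
    return 0;
}
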
+
+template <bool multiply_alpha>
+void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
{
const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
// The implementation computes 16 elements per iteration
- const int window_start_x = 16 * window.thread_id();
- const int window_step_x = 16 * window.num_threads();
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
// Make sure (window_end_x - window_start_x) is a multiple of window_step_x
const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
@@ -103,7 +251,7 @@
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
-#endif
+#endif /* __arm__ */
auto vec_a_end_addr = vec_a + num_elems_vec_a;
for(; vec_a <= (vec_a_end_addr - 4);)
@@ -126,7 +274,7 @@
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
-#endif
+#endif /* __arm__ */
acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
@@ -206,7 +354,7 @@
}
template <bool multiply_alpha>
-void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
{
const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
@@ -214,8 +362,8 @@
const int fixed_point_position = input0->info()->fixed_point_position();
// The implementation computes 32 elements per iteration
- const int window_start_x = 32 * window.thread_id();
- const int window_step_x = 32 * window.num_threads();
+ const int window_start_x = 32 * info.thread_id;
+ const int window_step_x = 32 * info.num_threads;
// Make sure (window_end_x - window_start_x) is a multiple of window_step_x
const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
@@ -315,7 +463,7 @@
// Multiply by the weight of the matrix product (alpha)
if(multiply_alpha)
{
- const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
@@ -334,6 +482,135 @@
}
template <bool multiply_alpha>
+void vector_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
+{
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+ const int fixed_point_position = input0->info()->fixed_point_position();
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+ ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ // Reset accumulators
+ qint32x4_t acc00_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc01_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc02_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc03_qs32 = vdupq_n_qs32(0);
+
+ auto vec_a = reinterpret_cast<const qint16_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const qint16_t *>(inb.ptr());
+
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for(; vec_a <= (vec_a_end_addr - 2);)
+ {
+ const qint16x4_t a0 = vld1_dup_qs16(vec_a + 0);
+ const qint16x4_t a1 = vld1_dup_qs16(vec_a + 1);
+
+ const qint16x4_t b00 = vld1_qs16(matrix_b + 0 + 0 * in_b_stride);
+ const qint16x4_t b01 = vld1_qs16(matrix_b + 4 + 0 * in_b_stride);
+ const qint16x4_t b02 = vld1_qs16(matrix_b + 8 + 0 * in_b_stride);
+ const qint16x4_t b03 = vld1_qs16(matrix_b + 12 + 0 * in_b_stride);
+ const qint16x4_t b10 = vld1_qs16(matrix_b + 0 + 1 * in_b_stride);
+ const qint16x4_t b11 = vld1_qs16(matrix_b + 4 + 1 * in_b_stride);
+ const qint16x4_t b12 = vld1_qs16(matrix_b + 8 + 1 * in_b_stride);
+ const qint16x4_t b13 = vld1_qs16(matrix_b + 12 + 1 * in_b_stride);
+
+ // First accumulation
+ acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
+ acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
+ acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position);
+ acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position);
+
+ // Second accumulation
+ acc00_qs32 = vqmlal_qs16(acc00_qs32, b10, a1, fixed_point_position);
+ acc01_qs32 = vqmlal_qs16(acc01_qs32, b11, a1, fixed_point_position);
+ acc02_qs32 = vqmlal_qs16(acc02_qs32, b12, a1, fixed_point_position);
+ acc03_qs32 = vqmlal_qs16(acc03_qs32, b13, a1, fixed_point_position);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const qint16x4_t a0 = vld1_dup_qs16(vec_a);
+
+ const qint16x4_t b00 = vld1_qs16(matrix_b + 0);
+ const qint16x4_t b01 = vld1_qs16(matrix_b + 4);
+ const qint16x4_t b02 = vld1_qs16(matrix_b + 8);
+ const qint16x4_t b03 = vld1_qs16(matrix_b + 12);
+
+ acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
+ acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
+ acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position);
+ acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ // Convert back to qint16x4_t and saturate
+ qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32);
+ qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32);
+ qint16x4_t acc02_qs16 = vqmovn_qs32(acc02_qs32);
+ qint16x4_t acc03_qs16 = vqmovn_qs32(acc03_qs32);
+
+ // Multiply by the weight of the matrix product (alpha)
+ if(multiply_alpha)
+ {
+ const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
+ acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
+ acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
+ acc02_qs16 = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position);
+ acc03_qs16 = vqmul_qs16(acc03_qs16, alpha_qs16, fixed_point_position);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<qint16_t *>(out.ptr());
+
+ // Store 16x4 output elements
+ vst1_qs16(mtx_out0 + 0, acc00_qs16);
+ vst1_qs16(mtx_out0 + 4, acc01_qs16);
+ vst1_qs16(mtx_out0 + 8, acc02_qs16);
+ vst1_qs16(mtx_out0 + 12, acc03_qs16);
+ },
+ ina, inb, out);
+}
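
The QS16 path widens into 32-bit accumulators via vqmlal_qs16 and narrows back with saturation via vqmovn_qs32. A scalar sketch of the fixed-point arithmetic behind those steps, with the shift by fixed_point_position made explicit (illustrative; rounding simplified to truncation, and the helper names are not the library's):

#include <algorithm>
#include <cstdint>

// Q(fp) values are stored in int16_t; the accumulator is a wider int32_t.
// Multiplying two Q(fp) numbers yields Q(2 * fp); shifting right by fp
// brings the product back to Q(fp) before accumulation.
static int32_t fixed_point_mla(int32_t acc, int16_t a, int16_t b, int fp)
{
    const int64_t product = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    return acc + static_cast<int32_t>(product >> fp);
}

// Saturating narrow from the 32-bit accumulator back to int16_t.
static int16_t saturating_narrow(int32_t v)
{
    return static_cast<int16_t>(std::min<int32_t>(std::max<int32_t>(v, INT16_MIN), INT16_MAX));
}
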
+
+template <bool multiply_alpha>
void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
{
const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
@@ -386,7 +663,7 @@
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif
+#endif /* __arm__ */
auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
@@ -405,7 +682,7 @@
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif
+#endif /* __arm__ */
// 4x4 block 0
acc00 = vmlaq_f32(acc00, b00, a0);
@@ -496,7 +773,7 @@
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif
+#endif /* __arm__ */
// 4x4 block 0
acc00 = vmlaq_f32(acc00, b00, a0);
@@ -587,7 +864,7 @@
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif
+#endif /* __arm__ */
// 4x4 block 0
acc00 = vmlaq_f32(acc00, b00, a0);
acc10 = vmlaq_f32(acc10, b00, a1);
@@ -639,8 +916,9 @@
void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
{
#ifdef ARM_COMPUTE_ENABLE_FP16
- const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
- const size_t out_stride = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const int num_elems_matrix_b_x = input1->info()->dimension(0);
// Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
Window win_a(window);
@@ -662,9 +940,6 @@
Iterator inb(input1, win_b);
Iterator out(output, window);
- // Number of iterations of inner loop. Since 8 is the number of accumulations per loop, num_it = (width_mtx_b / 4) / 8
- const size_t num_it = ((input1->info()->dimension(0)) >> 2) >> 3;
-
const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
execute_window_loop(window, [&](const Coordinates & id)
@@ -710,10 +985,14 @@
The size of the output tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size.
*/
- for(size_t k = num_it; k > 0; mtx_a0 += 16, mtx_b0 += 32, --k)
+ const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
+
+ for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
{
const float16x8_t p00 = vld1q_f16(mtx_a0);
const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
+
const float16x8_t q00 = vld1q_f16(mtx_b0);
const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
@@ -738,6 +1017,24 @@
c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
+
+ mtx_a0 += 16;
+ mtx_b0 += 32;
+ }
+
+ for(; mtx_b0 < mtx_b0_end_addr;)
+ {
+ const float16x4_t p00 = vld1_f16(mtx_a0);
+ const float16x8_t q00 = vld1q_f16(mtx_b0);
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));
+
+ mtx_a0 += 4;
+ mtx_b0 += 8;
}
if(multiply_alpha)
@@ -754,9 +1051,14 @@
vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
},
ina, inb, out);
-#else
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(input0);
+ ARM_COMPUTE_UNUSED(input1);
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_ERROR("Not implemented");
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
}
template <bool multiply_alpha>
@@ -768,7 +1070,7 @@
const size_t out_stride3 = out_stride1 * 3;
const int num_elems_matrix_b_x = input1->info()->dimension(0);
const int fixed_point_position = input0->info()->fixed_point_position();
- const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
ARM_COMPUTE_UNUSED(alpha_qs8);
// Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
@@ -867,7 +1169,7 @@
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif
+#endif /* __arm__ */
// Second accumulation
acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position);
@@ -992,6 +1294,120 @@
ina, inb, out);
}
+template <bool multiply_alpha>
+void matrix_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const size_t out_stride2 = out_stride1 * 2;
+ const size_t out_stride3 = out_stride1 * 3;
+ const int num_elems_matrix_b_x = input1->info()->dimension(0);
+ const int fixed_point_position = input0->info()->fixed_point_position();
+ const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
+ ARM_COMPUTE_UNUSED(alpha_qs16);
+
+ // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ // Set step_x and step_y for matrix B. Scale the X range by a factor of 8 as the transposed input matrix B has 8 times fewer columns than the output matrix
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, window);
+
+ // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
+ // The reshaping makes the implementation cache friendly and avoids the data re-arrangements needed for computing 8x4 elements per iteration
+ // All the values needed for computing a single 8x4 block will be read from consecutive memory positions
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto mtx_a0 = reinterpret_cast<const qint16_t *>(ina.ptr());
+ auto mtx_b0 = reinterpret_cast<const qint16_t *>(inb.ptr());
+ auto mtx_b1 = mtx_b0 + in_b_stride;
+
+ qint32x4_t acc00_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc10_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc20_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc30_qs32 = vdupq_n_qs32(0);
+
+ qint32x4_t acc01_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc11_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc21_qs32 = vdupq_n_qs32(0);
+ qint32x4_t acc31_qs32 = vdupq_n_qs32(0);
+
+ // Each iteration of this loop performs a single accumulation step for the whole 8x4 output block
+ for(int k = 0; k <= (num_elems_matrix_b_x - 8); k += 8)
+ {
+ const qint16x4_t a0 = vld1_dup_qs16(mtx_a0 + 0);
+ const qint16x4_t a1 = vld1_dup_qs16(mtx_a0 + 1);
+ const qint16x4_t a2 = vld1_dup_qs16(mtx_a0 + 2);
+ const qint16x4_t a3 = vld1_dup_qs16(mtx_a0 + 3);
+
+ const qint16x4_t b00 = vld1_qs16(mtx_b0 + 0);
+ const qint16x4_t b01 = vld1_qs16(mtx_b0 + 4);
+
+ acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
+ acc10_qs32 = vqmlal_qs16(acc10_qs32, b00, a1, fixed_point_position);
+ acc20_qs32 = vqmlal_qs16(acc20_qs32, b00, a2, fixed_point_position);
+ acc30_qs32 = vqmlal_qs16(acc30_qs32, b00, a3, fixed_point_position);
+ acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
+ acc11_qs32 = vqmlal_qs16(acc11_qs32, b01, a1, fixed_point_position);
+ acc21_qs32 = vqmlal_qs16(acc21_qs32, b01, a2, fixed_point_position);
+ acc31_qs32 = vqmlal_qs16(acc31_qs32, b01, a3, fixed_point_position);
+
+ mtx_a0 += 4;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+ }
+
+ // Convert back to qint16x4_t and saturate
+ qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32);
+ qint16x4_t acc10_qs16 = vqmovn_qs32(acc10_qs32);
+ qint16x4_t acc20_qs16 = vqmovn_qs32(acc20_qs32);
+ qint16x4_t acc30_qs16 = vqmovn_qs32(acc30_qs32);
+
+ qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32);
+ qint16x4_t acc11_qs16 = vqmovn_qs32(acc11_qs32);
+ qint16x4_t acc21_qs16 = vqmovn_qs32(acc21_qs32);
+ qint16x4_t acc31_qs16 = vqmovn_qs32(acc31_qs32);
+
+ // Multiply by the weight of the matrix product (alpha)
+ if(multiply_alpha)
+ {
+ acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
+ acc10_qs16 = vqmul_qs16(acc10_qs16, alpha_qs16, fixed_point_position);
+ acc20_qs16 = vqmul_qs16(acc20_qs16, alpha_qs16, fixed_point_position);
+ acc30_qs16 = vqmul_qs16(acc30_qs16, alpha_qs16, fixed_point_position);
+ acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
+ acc11_qs16 = vqmul_qs16(acc11_qs16, alpha_qs16, fixed_point_position);
+ acc21_qs16 = vqmul_qs16(acc21_qs16, alpha_qs16, fixed_point_position);
+ acc31_qs16 = vqmul_qs16(acc31_qs16, alpha_qs16, fixed_point_position);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<qint16_t *>(out.ptr());
+
+ // Store 8x4 output elements
+ vst1_qs16(mtx_out0 + 0, acc00_qs16);
+ vst1_qs16(mtx_out0 + 4, acc01_qs16);
+ vst1_qs16(mtx_out0 + out_stride1 + 0, acc10_qs16);
+ vst1_qs16(mtx_out0 + out_stride1 + 4, acc11_qs16);
+ vst1_qs16(mtx_out0 + out_stride2 + 0, acc20_qs16);
+ vst1_qs16(mtx_out0 + out_stride2 + 4, acc21_qs16);
+ vst1_qs16(mtx_out0 + out_stride3 + 0, acc30_qs16);
+ vst1_qs16(mtx_out0 + out_stride3 + 4, acc31_qs16);
+ },
+ ina, inb, out);
+}
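
The qs16 kernel above relies on the library's saturating fixed-point intrinsics. As a reference, here is a minimal scalar sketch of the multiply-accumulate that vqmlal_qs16 performs on each lane; the exact rounding behaviour of the library intrinsic is an assumption, not taken from this patch.

#include <algorithm>
#include <cstdint>

// Scalar model of a QS16 multiply-accumulate: values are int16_t with an
// implicit scale of 2^-fixed_point_position, accumulated into int32_t.
int32_t qs16_mac(int32_t acc, int16_t a, int16_t b, int fixed_point_position)
{
    // Widen, multiply, then shift back so the product keeps the same scale.
    const int64_t prod = (static_cast<int64_t>(a) * static_cast<int64_t>(b)) >> fixed_point_position;
    const int64_t sum  = static_cast<int64_t>(acc) + prod;
    // Saturate to the 32-bit accumulator range, as the vq* intrinsics do.
    return static_cast<int32_t>(std::min<int64_t>(std::max<int64_t>(sum, INT32_MIN), INT32_MAX));
}
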
} // namespace
NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel()
@@ -1001,10 +1417,7 @@
void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
@@ -1036,6 +1449,18 @@
num_elems_processed_per_iteration_x = 32;
break;
}
+ case DataType::QS16:
+ {
+ num_elems_processed_per_iteration_x = 16;
+ break;
+ }
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ num_elems_processed_per_iteration_x = 32;
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
{
ARM_COMPUTE_ERROR("Data type not supported");
@@ -1049,7 +1474,7 @@
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
update_window_and_padding(win,
- AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
+ AccessWindowStatic(input0->info(), 0, 0, input0->info()->tensor_shape().x(), 1),
AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
output_access);
@@ -1073,13 +1498,18 @@
num_elems_processed_per_iteration_x = 32;
break;
}
- case DataType::F16:
+ case DataType::QS16:
{
-#ifdef ARM_COMPUTE_ENABLE_FP16
num_elems_processed_per_iteration_x = 8;
break;
-#endif
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ num_elems_processed_per_iteration_x = 8;
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
{
ARM_COMPUTE_ERROR("Data type not supported");
@@ -1094,7 +1524,7 @@
update_window_and_padding(win,
AccessWindowRectangle(input0->info(), 0, 0, 4, 1, 1.f, 0.25f),
- AccessWindowTranspose(input1->info(), 0, 0, 4, 1, 0.f, 0.25f),
+ AccessWindowStatic(input1->info(), 0, 0, input1->info()->tensor_shape().x(), ceil_to_multiple(input1->info()->tensor_shape().y(), 4)),
output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
@@ -1103,30 +1533,44 @@
}
}
-void NEGEMMMatrixMultiplyKernel::run(const Window &window)
+void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
bool multiply_alpha = std::abs(1.0f - _alpha) > 0.00001f;
- // Check if the output tensor is a vector and the data type is F32. If so,the kernel runs the vector-matrix multiplication
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
if((_output->info()->dimension(1) == 1))
{
switch(_input0->info()->data_type())
{
case DataType::F32:
{
- multiply_alpha ? vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
- vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
+ multiply_alpha ? vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, info, _alpha) :
+ vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, info, _alpha);
break;
}
case DataType::QS8:
{
- multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
- vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
+ multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, info, _alpha) :
+ vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, info, _alpha);
break;
}
+ case DataType::QS16:
+ {
+ multiply_alpha ? vector_matrix_multiply_qs16<true>(_input0, _input1, _output, window, info, _alpha) :
+ vector_matrix_multiply_qs16<false>(_input0, _input1, _output, window, info, _alpha);
+ break;
+ }
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ {
+ multiply_alpha ? vector_matrix_multiply_f16<true>(_input0, _input1, _output, window, info, _alpha) :
+ vector_matrix_multiply_f16<false>(_input0, _input1, _output, window, info, _alpha);
+ break;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
{
ARM_COMPUTE_ERROR("Data type not supported");
@@ -1150,14 +1594,20 @@
matrix_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
break;
}
+ case DataType::QS16:
+ {
+ multiply_alpha ? matrix_matrix_multiply_qs16<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_qs16<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+#ifdef ARM_COMPUTE_ENABLE_FP16
case DataType::F16:
{
-#ifdef ARM_COMPUTE_ENABLE_FP16
multiply_alpha ? matrix_matrix_multiply_f16<true>(_input0, _input1, _output, window, _alpha) :
matrix_matrix_multiply_f16<false>(_input0, _input1, _output, window, _alpha);
break;
-#endif
}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
{
ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index ccf5cb4..7f4ee1e 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -43,7 +43,8 @@
void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape{ input->info()->tensor_shape() };
@@ -56,28 +57,33 @@
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- const float scale_x = num_elems_processed_per_iteration;
+ const int scale_x = num_elems_processed_per_iteration;
_input = input;
_output = output;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
+
AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
update_window_and_padding(win,
AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
INEKernel::configure(win);
}
-void NEGEMMTranspose1xWKernel::run(const Window &window)
+void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
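
The transpose kernel above derives its block width from the element size (16 bytes per iteration), so a QS16 or F16 input is processed 8 elements at a time and an F32 input 4 at a time. As a reference, a scalar model of the 1xW reshaping is sketched here; the layout is inferred from the kernel's documented pairing with NEGEMMInterleave4x4 and should be treated as an assumption rather than the library implementation.

#include <cstddef>
#include <vector>

// Scalar model of NEGEMMTranspose1xW: each W-wide chunk of an input row
// becomes W consecutive elements of one output row, with W = 16 / sizeof(T).
template <typename T>
std::vector<T> transpose_1xW(const std::vector<T> &in, size_t rows, size_t cols)
{
    const size_t W          = 16 / sizeof(T);
    const size_t num_blocks = (cols + W - 1) / W;    // output height
    std::vector<T> out(num_blocks * rows * W, T(0)); // zero-pad the ragged last block
    for(size_t r = 0; r < rows; ++r)
    {
        for(size_t c = 0; c < cols; ++c)
        {
            const size_t block  = c / W; // which output row
            const size_t offset = c % W; // position inside the 1xW chunk
            out[block * rows * W + r * W + offset] = in[r * cols + c];
        }
    }
    return out;
}
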
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
index 419f482..048c229 100644
--- a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
@@ -64,8 +64,9 @@
INEKernel::configure(win);
}
-void NEGaussian3x3Kernel::run(const Window &window)
+void NEGaussian3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
index f872cc2..b62e281 100644
--- a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
@@ -73,8 +73,9 @@
INEKernel::configure(win);
}
-void NEGaussian5x5HorKernel::run(const Window &window)
+void NEGaussian5x5HorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -116,8 +117,8 @@
void NEGaussian5x5VertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
{
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::S16);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
_input = input;
_output = output;
@@ -140,8 +141,9 @@
INEKernel::configure(win);
}
-void NEGaussian5x5VertKernel::run(const Window &window)
+void NEGaussian5x5VertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
index 52d1fbf..d6cb1b6 100644
--- a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
@@ -110,8 +110,9 @@
INEKernel::configure(win);
}
-void NEGaussianPyramidHorKernel::run(const Window &window)
+void NEGaussianPyramidHorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(window.x().step() % 2);
@@ -215,8 +216,9 @@
INEKernel::configure(win);
}
-void NEGaussianPyramidVertKernel::run(const Window &window)
+void NEGaussianPyramidVertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(window.x().step() != 16);
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index 404ad8a..3fd81be 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -675,8 +675,9 @@
INEKernel::configure(win);
}
-void NEHOGOrientationBinningKernel::run(const Window &window)
+void NEHOGOrientationBinningKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -768,8 +769,9 @@
INEKernel::configure(win);
}
-void NEHOGBlockNormalizationKernel::run(const Window &window)
+void NEHOGBlockNormalizationKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
index 4af22bc..343b051 100644
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -92,8 +92,9 @@
INEKernel::configure(win);
}
-void NEHOGDetectorKernel::run(const Window &window)
+void NEHOGDetectorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_hog_descriptor == nullptr);
@@ -176,7 +177,7 @@
win.idx_class = _idx_class;
win.score = score;
- std::unique_lock<std::mutex> lock(_mutex);
+ std::unique_lock<arm_compute::Mutex> lock(_mutex);
_detection_windows->push_back(win);
lock.unlock();
}
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 585676b..233b2ba 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -287,8 +287,9 @@
}
template <int32_t block_size>
-void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window)
+void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -360,7 +361,7 @@
INEKernel::configure(win);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
template class arm_compute::NEHarrisScoreKernel<3>;
template class arm_compute::NEHarrisScoreKernel<5>;
@@ -1029,8 +1030,9 @@
}
template <int32_t block_size>
-void NEHarrisScoreKernel<block_size>::run(const Window &window)
+void NEHarrisScoreKernel<block_size>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp
index 9e967ec..6e402ae 100644
--- a/src/core/NEON/kernels/NEHistogramKernel.cpp
+++ b/src/core/NEON/kernels/NEHistogramKernel.cpp
@@ -44,7 +44,7 @@
inline void NEHistogramKernel::merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins)
{
- std::lock_guard<std::mutex> lock(_hist_mtx);
+ std::lock_guard<arm_compute::Mutex> lock(_hist_mtx);
const unsigned int v_end = (bins / 4) * 4;
@@ -66,7 +66,7 @@
{
}
-void NEHistogramKernel::histogram_U8(Window win)
+void NEHistogramKernel::histogram_U8(Window win, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
@@ -74,7 +74,7 @@
const int32_t offset = _output->offset();
const uint32_t offrange = offset + _output->range();
const uint32_t *const w_lut = _window_lut;
- uint32_t *const local_hist = _local_hist + win.thread_id() * bins;
+ uint32_t *const local_hist = _local_hist + info.thread_id * bins;
// Clear local_histogram
std::fill_n(local_hist, bins, 0);
@@ -129,8 +129,9 @@
merge_histogram(_output->buffer(), local_hist, bins);
}
-void NEHistogramKernel::histogram_fixed_U8(Window win)
+void NEHistogramKernel::histogram_fixed_U8(Window win, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
std::array<uint32_t, _max_range_size> local_hist{ { 0 } };
@@ -242,11 +243,11 @@
INEKernel::configure(win);
}
-void NEHistogramKernel::run(const Window &window)
+void NEHistogramKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (this->*_func)(window);
+ (this->*_func)(window, info);
}
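
The histogram kernel now indexes its per-thread scratch area with info.thread_id and merges into the shared histogram under a mutex. A minimal sketch of that pattern follows; the binning rule and all names are illustrative, only the thread-local-then-merge structure mirrors the kernel.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <vector>

// Illustrative: accumulate into a thread-local histogram, then merge once
// into the shared histogram while holding the lock.
void histogram_thread(const uint8_t *data, size_t n, size_t bins,
                      std::vector<uint32_t> &local_hist,
                      std::vector<uint32_t> &global_hist, std::mutex &mtx)
{
    std::fill(local_hist.begin(), local_hist.end(), 0u);
    for(size_t i = 0; i < n; ++i)
    {
        const size_t bin = static_cast<size_t>(data[i]) * bins / 256; // simple uniform binning (assumption)
        ++local_hist[bin];
    }
    std::lock_guard<std::mutex> lock(mtx); // the kernel uses arm_compute::Mutex here
    for(size_t b = 0; b < bins; ++b)
    {
        global_hist[b] += local_hist[b];
    }
}
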
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index c7c23d5..71910e3 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -47,7 +48,8 @@
bool has_bias,
int top_left_x,
int top_left_y,
- int kernel_size,
+ int kernel_width,
+ int kernel_height,
int kernel_depth,
int input_w,
int input_h,
@@ -56,9 +58,9 @@
int input_stride_z,
int fixed_point_position)
{
- const int kernel_size2 = kernel_size * kernel_size;
- const int x_e = top_left_x + kernel_size;
- const int y_e = top_left_y + kernel_size;
+ const int kernel_size2 = kernel_width * kernel_height;
+ const int x_e = top_left_x + kernel_width;
+ const int y_e = top_left_y + kernel_height;
// Linearize volume
int d = 0;
@@ -109,8 +111,8 @@
if((y < 0 || y >= input_h) && has_pads)
{
// All the values will be zeros
- memset(out_ptr, 0, kernel_size * sizeof(T));
- out_ptr += kernel_size;
+ memset(out_ptr, 0, kernel_width * sizeof(T));
+ out_ptr += kernel_width;
}
else
{
@@ -132,9 +134,13 @@
// Append 1 if the convolution layer has biases
if(has_bias)
{
- if(std::is_same<T, arm_compute::qint8_t>::value)
+ if(std::is_same<T, qint8_t>::value)
{
- *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position);
+ *out_ptr = sqcvt_qs8_f32(1.0f, fixed_point_position);
+ }
+ else if(std::is_same<T, qint16_t>::value)
+ {
+ *out_ptr = sqcvt_qs16_f32(1.0f, fixed_point_position);
}
else
{
@@ -199,7 +205,8 @@
_has_bias,
top_left_x,
top_left_y,
- static_cast<int>(_kernel_size),
+ static_cast<int>(_kernel_width),
+ static_cast<int>(_kernel_height),
kernel_depth,
input_w,
input_h,
@@ -224,7 +231,7 @@
in_window.set(Window::DimX, Window::Dimension(0, 1, 1));
Window out_window;
- out_window.use_tensor_dimensions(_output->info());
+ out_window.use_tensor_dimensions(_output->info()->tensor_shape());
out_window.set(Window::DimX, Window::Dimension(out_window.x().start(), out_window.x().end(), in_width));
Window in_slice = in_window.first_slice_window_3D();
@@ -246,9 +253,13 @@
// Add bias
if(_has_bias)
{
- if(std::is_same<T, arm_compute::qint8_t>::value)
+ if(std::is_same<T, qint8_t>::value)
{
- *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+ }
+ else if(std::is_same<T, qint16_t>::value)
+ {
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs16_f32(1.0f, _input->info()->fixed_point_position());
}
else
{
@@ -260,24 +271,30 @@
}
NEIm2ColKernel::NEIm2ColKernel()
- : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _has_bias(false)
+ : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false)
{
}
-void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
+void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
_input = input;
_output = output;
- _convolved_dims = convolved_dims;
_conv_info = conv_info;
- _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 1 : 0)) / input->info()->dimension(2));
- _has_bias = has_bias;
+ _kernel_width = kernel_dims.width;
+ _kernel_height = kernel_dims.height;
+ _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+ _kernel_width, _kernel_height,
+ _conv_info);
+ _has_bias = has_bias;
- unsigned int pad_x, pad_y, stride_x, stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
std::tie(pad_x, pad_y) = conv_info.pad();
std::tie(stride_x, stride_y) = conv_info.stride();
@@ -296,9 +313,17 @@
case DataType::F32:
_func = &NEIm2ColKernel::run_reduced<float>;
break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ _func = &NEIm2ColKernel::run_reduced<float16_t>;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::QS8:
_func = &NEIm2ColKernel::run_reduced<qint8_t>;
break;
+ case DataType::QS16:
+ _func = &NEIm2ColKernel::run_reduced<qint16_t>;
+ break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
@@ -311,9 +336,17 @@
case DataType::F32:
_func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float16_t, false> : &NEIm2ColKernel::run_generic<float16_t, true>;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::QS8:
_func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
break;
+ case DataType::QS16:
+ _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint16_t, false> : &NEIm2ColKernel::run_generic<qint16_t, true>;
+ break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
@@ -329,8 +362,9 @@
IKernel::configure(window);
}
-void NEIm2ColKernel::run(const Window &window)
+void NEIm2ColKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
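
NEIm2ColKernel::configure() now computes _convolved_dims from the kernel dimensions and the PadStrideInfo via scaled_dimensions(). The usual formula behind that helper is sketched below; the floor division and symmetric padding are assumptions made for illustration.

#include <utility>

// Output spatial size of a convolution, one value per dimension:
// out = (in + 2 * pad - kernel) / stride + 1, using integer (floor) division.
std::pair<unsigned int, unsigned int> conv_output_dims(unsigned int in_w, unsigned int in_h,
                                                       unsigned int kernel_w, unsigned int kernel_h,
                                                       unsigned int pad_x, unsigned int pad_y,
                                                       unsigned int stride_x, unsigned int stride_y)
{
    const unsigned int out_w = (in_w + 2 * pad_x - kernel_w) / stride_x + 1;
    const unsigned int out_h = (in_h + 2 * pad_y - kernel_h) / stride_y + 1;
    return std::make_pair(out_w, out_h);
}
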
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
index 3b09a1b..16a3cf7 100644
--- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -71,8 +71,9 @@
return false;
}
-void NEIntegralImageKernel::run(const Window &window)
+void NEIntegralImageKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEL2NormalizeKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeKernel.cpp
new file mode 100644
index 0000000..12c532a
--- /dev/null
+++ b/src/core/NEON/kernels/NEL2NormalizeKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cmath>
+
+using namespace arm_compute;
+
+namespace
+{
+void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window in_slice = window.first_slice_window_1D();
+ Window sum_slice = window_sum.first_slice_window_1D();
+
+ do
+ {
+ Iterator input_it(in, in_slice);
+ Iterator sum_it(sum, sum_slice);
+ Iterator output_it(out, in_slice);
+
+ const float sum_value = *reinterpret_cast<const float *>(sum_it.ptr());
+ const float32x4_t vec_normalize_value = vdupq_n_f32(1.f / std::sqrt(std::max(sum_value, epsilon)));
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input_it.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(output_it.ptr());
+
+ vst1q_f32(out_ptr, vmulq_f32(vld1q_f32(in_ptr), vec_normalize_value));
+ },
+ input_it, output_it);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+} // namespace
+
+NEL2NormalizeKernel::NEL2NormalizeKernel()
+ : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12)
+{
+}
+
+void NEL2NormalizeKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
+ ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, supported axis is 0");
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
+ unsigned int num_elems_processed_per_iteration_sum = (axis == 0) ? 1 : num_elems_processed_per_iteration;
+
+ _input = input;
+ _sum = sum;
+ _output = output;
+ _axis = axis;
+ _epsilon = epsilon;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_processed_per_iteration_sum);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, sum_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEL2NormalizeKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_axis)
+ {
+ case 0:
+ l2_normalize_X(_input, _sum, _output, _epsilon, window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported normalization axis");
+ }
+}
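
For reference, the arithmetic performed by l2_normalize_X above can be written as a plain scalar loop; sum_of_squares is assumed to be the reduction produced by a separate kernel, exactly as the sum tensor is consumed here.

#include <algorithm>
#include <cmath>
#include <cstddef>

// Scalar reference: out[i] = in[i] / sqrt(max(sum_of_squares, epsilon)).
void l2_normalize_row(const float *in, float sum_of_squares, float *out, size_t len, float epsilon)
{
    const float norm = 1.f / std::sqrt(std::max(sum_of_squares, epsilon));
    for(size_t i = 0; i < len; ++i)
    {
        out[i] = in[i] * norm;
    }
}
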
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index 3d2bfb2..6fac797 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -385,8 +385,9 @@
INEKernel::configure(window);
}
-void NELKTrackerKernel::run(const Window &window)
+void NELKTrackerKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index ab84efb..1b2942c 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -49,15 +49,136 @@
namespace
{
-void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window)
+void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ float16x8_t acc0 = vdupq_n_f16(0.f);
+ float16x8_t acc1 = vdupq_n_f16(0.f);
+ float16x8_t acc2 = vdupq_n_f16(0.f);
+ float16x8_t acc3 = vdupq_n_f16(0.f);
+
+ auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float16_t *>(input1->ptr_to_element(Coordinates(id[0], 0, id[1])));
+
+ const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+
+ for(; vec_a <= (vec_a_end_addr - 4);)
+ {
+ const float16x4_t a0l = vld1_f16(vec_a);
+
+ float16x8_t b00 = vld1q_f16(matrix_b);
+ float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+
+ float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
+
+ matrix_b += 2 * in_b_stride;
+
+ b00 = vld1q_f16(matrix_b);
+ b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
+
+ vec_a += 4;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const float16_t a0 = *vec_a;
+ const float16x8_t b00 = vld1q_f16(matrix_b);
+ const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
+ acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
+ acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
+ acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ const auto vec_out = reinterpret_cast<float16_t *>(out.ptr());
+
+ vst1q_f16(vec_out + 0, acc0);
+ vst1q_f16(vec_out + 8, acc1);
+ vst1q_f16(vec_out + 16, acc2);
+ vst1q_f16(vec_out + 24, acc3);
+ },
+ ina, out);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(input0);
+ ARM_COMPUTE_UNUSED(input1);
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR("Not supported, recompile with -march=armv8.2-a+fp16+simd.");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info)
{
const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
// The implementation computes 16 elements per iteration
- const int window_start_x = 16 * window.thread_id();
- const int window_step_x = 16 * window.num_threads();
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
// Make sure (window_end_x - window_start_x) is a multiple of window_step_x
const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
@@ -89,7 +210,7 @@
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
-#endif
+#endif /* __arm__ */
const float *vec_a_end_addr = vec_a + num_elems_vec_a;
@@ -113,7 +234,7 @@
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
-#endif
+#endif /* __arm__ */
acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
@@ -190,17 +311,17 @@
void NELocallyConnectedMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
_input0 = input0;
_input1 = input1;
_output = output;
- unsigned int num_elems_processed_per_iteration_x = 16;
+ const unsigned int num_elems_processed_per_iteration_x = 16;
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
@@ -217,10 +338,27 @@
INEKernel::configure(win);
}
-void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window)
+void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- vector_matrix_multiply_f32(_input0, _input1, _output, window);
+ switch(_input0->info()->data_type())
+ {
+ case DataType::F16:
+ {
+ vector_matrix_multiply_f16(_input0, _input1, _output, window, info);
+ break;
+ }
+ case DataType::F32:
+ {
+ vector_matrix_multiply_f32(_input0, _input1, _output, window, info);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
}
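
Both vector_matrix_multiply_* functions above stride the X dimension across threads: thread t starts at column 16 * t and advances by 16 * num_threads, so the threads tile the output row without overlapping. A scalar stand-in for that partitioning (the 16-wide NEON body is replaced by a placeholder loop) is sketched here.

#include <cstddef>

// Illustrative: thread t covers columns 16*t, 16*t + 16*T, 16*t + 32*T, ...
void process_columns(float *row, size_t width, int thread_id, int num_threads)
{
    const size_t start = 16u * static_cast<size_t>(thread_id);
    const size_t step  = 16u * static_cast<size_t>(num_threads);
    for(size_t x = start; x < width; x += step)
    {
        // Real kernel: compute 16 output elements at column x with NEON intrinsics.
        for(size_t i = x; i < x + 16 && i < width; ++i)
        {
            row[i] = 0.f; // placeholder for the dot-product result
        }
    }
}
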
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
index a874d21..433985f 100644
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -415,8 +415,9 @@
}
template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::run(const Window &window)
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -428,7 +429,7 @@
template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
namespace
{
@@ -854,8 +855,9 @@
}
template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window)
+void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
index 4616203..7895b00 100644
--- a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
@@ -85,10 +85,15 @@
} // namespace
NEMeanStdDevKernel::NEMeanStdDevKernel()
- : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _mtx()
+ : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _mtx(), _border_size(0)
{
}
+BorderSize NEMeanStdDevKernel::border_size() const
+{
+ return _border_size;
+}
+
void NEMeanStdDevKernel::configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev, uint64_t *global_sum_squared)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
@@ -105,6 +110,8 @@
constexpr unsigned int num_elems_processed_per_iteration = 16;
+ _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration) - input->info()->dimension(0));
+
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -113,8 +120,9 @@
INEKernel::configure(win);
}
-void NEMeanStdDevKernel::run(const Window &window)
+void NEMeanStdDevKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Iterator input(_input, window);
@@ -134,7 +142,7 @@
const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1);
// Merge sum and calculate mean and stddev
- std::unique_lock<std::mutex> lock(_mtx);
+ std::unique_lock<arm_compute::Mutex> lock(_mtx);
*_global_sum += vget_lane_u64(local_sum, 0);
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
index 601a0e1..54ef33e 100644
--- a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
@@ -75,8 +75,9 @@
INEKernel::configure(win);
}
-void NEMedian3x3Kernel::run(const Window &window)
+void NEMedian3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
new file mode 100644
index 0000000..a81725f
--- /dev/null
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <climits>
+#include <cstddef>
+
+namespace arm_compute
+{
+NEMinMaxLayerKernel::NEMinMaxLayerKernel()
+ : _input(nullptr), _output(nullptr), _mtx()
+{
+}
+
+void NEMinMaxLayerKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(Window::DimX, 2);
+ output_shape.remove_dimension(1);
+ output_shape.remove_dimension(1);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, 2);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEMinMaxLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int x_start = window.x().start();
+ const int x_end = window.x().end();
+
+ Window window_output;
+ window_output.use_tensor_dimensions(_output->info()->tensor_shape());
+ window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Handle X dimension manually to split into two loops
+ // The first one uses vector operations, the second one processes the leftover pixels
+ Window window_input(window);
+ window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_input.collapse_if_possible(INEKernel::window(), 3);
+ window_input.set(3, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, window_input);
+ Iterator output(_output, window_output);
+
+ execute_window_loop(window_output, [&](const Coordinates & id_batch)
+ {
+ float32x2_t carry_min = vdup_n_f32(std::numeric_limits<float>::max());
+ float32x2_t carry_max = vdup_n_f32(std::numeric_limits<float>::lowest());
+
+ float carry_min_scalar = std::numeric_limits<float>::max();
+ float carry_max_scalar = std::numeric_limits<float>::lowest();
+
+ execute_window_loop(window_input, [&](const Coordinates & id)
+ {
+ int x = x_start;
+ const auto in_ptr = reinterpret_cast<const float *const>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
+
+ // Vector loop
+ for(; x <= x_end - 8; x += 8)
+ {
+ const float32x4x2_t pixels = vld2q_f32(in_ptr + x);
+ const float32x4_t tmp_min1 = vminq_f32(pixels.val[0], pixels.val[1]);
+ const float32x4_t tmp_max1 = vmaxq_f32(pixels.val[0], pixels.val[1]);
+ const float32x2_t tmp_min2 = vmin_f32(vget_high_f32(tmp_min1), vget_low_f32(tmp_min1));
+ const float32x2_t tmp_max2 = vmax_f32(vget_high_f32(tmp_max1), vget_low_f32(tmp_max1));
+ carry_min = vmin_f32(tmp_min2, carry_min);
+ carry_max = vmax_f32(tmp_max2, carry_max);
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ const float pixel = in_ptr[x];
+ carry_min_scalar = std::min(pixel, carry_min_scalar);
+ carry_max_scalar = std::max(pixel, carry_max_scalar);
+ }
+ },
+ input);
+
+ // Reduce result
+ carry_min = vpmin_f32(carry_min, carry_min);
+ carry_max = vpmax_f32(carry_max, carry_max);
+ carry_min = vpmin_f32(carry_min, carry_min);
+ carry_max = vpmax_f32(carry_max, carry_max);
+
+ // Extract max/min values
+ const float min_i = std::min(vget_lane_f32(carry_min, 0), carry_min_scalar);
+ const float max_i = std::max(vget_lane_f32(carry_max, 0), carry_max_scalar);
+
+ auto out_ptr = reinterpret_cast<float *const>(output.ptr());
+
+ // Perform reduction of local min/max values
+ update_min_max(out_ptr, min_i, max_i);
+ },
+ output);
+}
+
+void NEMinMaxLayerKernel::reset()
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+
+ float32x2_t reset_values = vdup_n_f32(0.0f);
+ reset_values = vset_lane_f32(std::numeric_limits<float>::max(), reset_values, 0);
+ reset_values = vset_lane_f32(std::numeric_limits<float>::min(), reset_values, 1);
+
+ Window window_output;
+ window_output.use_tensor_dimensions(_output->info()->tensor_shape());
+ window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator output(_output, window_output);
+
+ execute_window_loop(window_output, [&](const Coordinates & id)
+ {
+ vst1_f32(reinterpret_cast<float *const>(output.ptr()), reset_values);
+ },
+ output);
+}
+
+void NEMinMaxLayerKernel::update_min_max(float *out_ptr, float min, float max)
+{
+ std::lock_guard<Mutex> lock(_mtx);
+
+ const float32x2_t old_min = vld1_dup_f32(out_ptr);
+ const float32x2_t old_max = vld1_dup_f32(out_ptr + 1);
+ const float32x2_t new_min = vmin_f32(vdup_n_f32(min), old_min);
+ const float32x2_t new_max = vmax_f32(vdup_n_f32(max), old_max);
+
+ vst1_f32(out_ptr, vzip_f32(new_min, new_max).val[0]);
+}
+} // namespace arm_compute
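
update_min_max above keeps the running (min, max) pair interleaved in the output tensor, minimum at offset 0 and maximum at offset 1, and merges each thread's local result under the kernel's mutex. A scalar sketch of that merge:

#include <algorithm>
#include <mutex>

// Illustrative: out[0] holds the running minimum, out[1] the running maximum.
void merge_min_max(float *out, float local_min, float local_max, std::mutex &mtx)
{
    std::lock_guard<std::mutex> lock(mtx); // the kernel uses arm_compute::Mutex
    out[0] = std::min(out[0], local_min);
    out[1] = std::max(out[1], local_max);
}
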
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index b188614..c7dc03c 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -31,7 +31,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include <algorithm>
#include <arm_neon.h>
#include <climits>
#include <cstddef>
@@ -39,14 +41,14 @@
namespace arm_compute
{
NEMinMaxKernel::NEMinMaxKernel()
- : _func(), _input(nullptr), _min(), _max(), _min_init(), _max_init(), _mtx()
+ : _func(), _input(nullptr), _min(), _max(), _mtx()
{
}
-void NEMinMaxKernel::configure(const IImage *input, int32_t *min, int32_t *max)
+void NEMinMaxKernel::configure(const IImage *input, void *min, void *max)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON(nullptr == min);
ARM_COMPUTE_ERROR_ON(nullptr == max);
@@ -54,35 +56,33 @@
_min = min;
_max = max;
- switch(input->info()->format())
+ switch(_input->info()->data_type())
{
- case Format::U8:
- _min_init = UCHAR_MAX;
- _max_init = 0;
- _func = &NEMinMaxKernel::minmax_U8;
+ case DataType::U8:
+ _func = &NEMinMaxKernel::minmax_U8;
break;
- case Format::S16:
- _min_init = SHRT_MAX;
- _max_init = SHRT_MIN;
- _func = &NEMinMaxKernel::minmax_S16;
+ case DataType::S16:
+ _func = &NEMinMaxKernel::minmax_S16;
+ break;
+ case DataType::F32:
+ _func = &NEMinMaxKernel::minmax_F32;
break;
default:
- ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ ARM_COMPUTE_ERROR("Unsupported data type");
break;
}
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
INEKernel::configure(win);
}
-void NEMinMaxKernel::run(const Window &window)
+void NEMinMaxKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -93,40 +93,85 @@
void NEMinMaxKernel::reset()
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- *_min = _min_init;
- *_max = _max_init;
+ switch(_input->info()->data_type())
+ {
+ case DataType::U8:
+ *static_cast<int32_t *>(_min) = UCHAR_MAX;
+ *static_cast<int32_t *>(_max) = 0;
+ break;
+ case DataType::S16:
+ *static_cast<int32_t *>(_min) = SHRT_MAX;
+ *static_cast<int32_t *>(_max) = SHRT_MIN;
+ break;
+ case DataType::F32:
+ *static_cast<float *>(_min) = std::numeric_limits<float>::max();
+ *static_cast<float *>(_max) = std::numeric_limits<float>::lowest();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ break;
+ }
}
template <typename T>
void NEMinMaxKernel::update_min_max(const T min, const T max)
{
- std::lock_guard<std::mutex> lock(_mtx);
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
- if(min < *_min)
+ using type = typename std::conditional<std::is_same<T, float>::value, float, int32_t>::type;
+
+ auto min_ptr = static_cast<type *>(_min);
+ auto max_ptr = static_cast<type *>(_max);
+
+ if(min < *min_ptr)
{
- *_min = min;
+ *min_ptr = min;
}
- if(max > *_max)
+ if(max > *max_ptr)
{
- *_max = max;
+ *max_ptr = max;
}
}
-void NEMinMaxKernel::minmax_U8(const Window &win)
+void NEMinMaxKernel::minmax_U8(Window win)
{
uint8x8_t carry_min = vdup_n_u8(UCHAR_MAX);
uint8x8_t carry_max = vdup_n_u8(0);
+ uint8_t carry_max_scalar = 0;
+ uint8_t carry_min_scalar = UCHAR_MAX;
+
+ const int x_start = win.x().start();
+ const int x_end = win.x().end();
+
+ // Handle X dimension manually to split into two loops
+ // The first one uses vector operations, the second one processes the leftover pixels
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
Iterator input(_input, win);
execute_window_loop(win, [&](const Coordinates & id)
{
- const uint8x16_t pixels = vld1q_u8(input.ptr());
- const uint8x8_t tmp_min = vmin_u8(vget_high_u8(pixels), vget_low_u8(pixels));
- const uint8x8_t tmp_max = vmax_u8(vget_high_u8(pixels), vget_low_u8(pixels));
- carry_min = vmin_u8(tmp_min, carry_min);
- carry_max = vmax_u8(tmp_max, carry_max);
+ int x = x_start;
+
+ // Vector loop
+ for(; x <= x_end - 16; x += 16)
+ {
+ const uint8x16_t pixels = vld1q_u8(input.ptr() + x);
+ const uint8x8_t tmp_min = vmin_u8(vget_high_u8(pixels), vget_low_u8(pixels));
+ const uint8x8_t tmp_max = vmax_u8(vget_high_u8(pixels), vget_low_u8(pixels));
+ carry_min = vmin_u8(tmp_min, carry_min);
+ carry_max = vmax_u8(tmp_max, carry_max);
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ const uint8_t pixel = input.ptr()[x];
+ carry_min_scalar = std::min(pixel, carry_min_scalar);
+ carry_max_scalar = std::max(pixel, carry_max_scalar);
+ }
},
input);
@@ -139,30 +184,55 @@
carry_max = vpmax_u8(carry_max, carry_max);
// Extract max/min values
- const uint8_t min_i = vget_lane_u8(carry_min, 0);
- const uint8_t max_i = vget_lane_u8(carry_max, 0);
+ const uint8_t min_i = std::min(vget_lane_u8(carry_min, 0), carry_min_scalar);
+ const uint8_t max_i = std::max(vget_lane_u8(carry_max, 0), carry_max_scalar);
// Perform reduction of local min/max values
update_min_max(min_i, max_i);
}
-void NEMinMaxKernel::minmax_S16(const Window &win)
+void NEMinMaxKernel::minmax_S16(Window win)
{
int16x4_t carry_min = vdup_n_s16(SHRT_MAX);
int16x4_t carry_max = vdup_n_s16(SHRT_MIN);
+ int16_t carry_max_scalar = SHRT_MIN;
+ int16_t carry_min_scalar = SHRT_MAX;
+
+ const int x_start = win.x().start();
+ const int x_end = win.x().end();
+
+ // Handle X dimension manually to split into two loops
+    // The first loop uses vector operations, the second processes the leftover pixels
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
Iterator input(_input, win);
execute_window_loop(win, [&](const Coordinates & id)
{
- const auto in_ptr = reinterpret_cast<const int16_t *>(input.ptr());
- const int16x8x2_t pixels = vld2q_s16(in_ptr);
- const int16x8_t tmp_min1 = vminq_s16(pixels.val[0], pixels.val[1]);
- const int16x8_t tmp_max1 = vmaxq_s16(pixels.val[0], pixels.val[1]);
- const int16x4_t tmp_min2 = vmin_s16(vget_high_s16(tmp_min1), vget_low_s16(tmp_min1));
- const int16x4_t tmp_max2 = vmax_s16(vget_high_s16(tmp_max1), vget_low_s16(tmp_max1));
- carry_min = vmin_s16(tmp_min2, carry_min);
- carry_max = vmax_s16(tmp_max2, carry_max);
+ int x = x_start;
+ const auto in_ptr = reinterpret_cast<const int16_t *const>(input.ptr());
+
+ // Vector loop
+ for(; x <= x_end - 16; x += 16)
+ {
+ const int16x8x2_t pixels = vld2q_s16(in_ptr + x);
+ const int16x8_t tmp_min1 = vminq_s16(pixels.val[0], pixels.val[1]);
+ const int16x8_t tmp_max1 = vmaxq_s16(pixels.val[0], pixels.val[1]);
+ const int16x4_t tmp_min2 = vmin_s16(vget_high_s16(tmp_min1), vget_low_s16(tmp_min1));
+ const int16x4_t tmp_max2 = vmax_s16(vget_high_s16(tmp_max1), vget_low_s16(tmp_max1));
+ carry_min = vmin_s16(tmp_min2, carry_min);
+ carry_max = vmax_s16(tmp_max2, carry_max);
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ const int16_t pixel = in_ptr[x];
+ carry_min_scalar = std::min(pixel, carry_min_scalar);
+ carry_max_scalar = std::max(pixel, carry_max_scalar);
+ }
+
},
input);
@@ -173,15 +243,74 @@
carry_max = vpmax_s16(carry_max, carry_max);
// Extract max/min values
- const int16_t min_i = vget_lane_s16(carry_min, 0);
- const int16_t max_i = vget_lane_s16(carry_max, 0);
+ const int16_t min_i = std::min(vget_lane_s16(carry_min, 0), carry_min_scalar);
+ const int16_t max_i = std::max(vget_lane_s16(carry_max, 0), carry_max_scalar);
+
+ // Perform reduction of local min/max values
+ update_min_max(min_i, max_i);
+}
+
+void NEMinMaxKernel::minmax_F32(Window win)
+{
+ float32x2_t carry_min = vdup_n_f32(std::numeric_limits<float>::max());
+ float32x2_t carry_max = vdup_n_f32(std::numeric_limits<float>::lowest());
+
+ float carry_min_scalar = std::numeric_limits<float>::max();
+ float carry_max_scalar = std::numeric_limits<float>::lowest();
+
+ const int x_start = win.x().start();
+ const int x_end = win.x().end();
+
+ // Handle X dimension manually to split into two loops
+    // The first loop uses vector operations, the second processes the leftover pixels
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int x = x_start;
+ const auto in_ptr = reinterpret_cast<const float *const>(input.ptr());
+
+ // Vector loop
+ for(; x <= x_end - 8; x += 8)
+ {
+ const float32x4x2_t pixels = vld2q_f32(in_ptr + x);
+ const float32x4_t tmp_min1 = vminq_f32(pixels.val[0], pixels.val[1]);
+ const float32x4_t tmp_max1 = vmaxq_f32(pixels.val[0], pixels.val[1]);
+ const float32x2_t tmp_min2 = vmin_f32(vget_high_f32(tmp_min1), vget_low_f32(tmp_min1));
+ const float32x2_t tmp_max2 = vmax_f32(vget_high_f32(tmp_max1), vget_low_f32(tmp_max1));
+ carry_min = vmin_f32(tmp_min2, carry_min);
+ carry_max = vmax_f32(tmp_max2, carry_max);
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ const float pixel = in_ptr[x];
+ carry_min_scalar = std::min(pixel, carry_min_scalar);
+ carry_max_scalar = std::max(pixel, carry_max_scalar);
+ }
+
+ },
+ input);
+
+ // Reduce result
+ carry_min = vpmin_f32(carry_min, carry_min);
+ carry_max = vpmax_f32(carry_max, carry_max);
+ carry_min = vpmin_f32(carry_min, carry_min);
+ carry_max = vpmax_f32(carry_max, carry_max);
+
+ // Extract max/min values
+ const float min_i = std::min(vget_lane_f32(carry_min, 0), carry_min_scalar);
+ const float max_i = std::max(vget_lane_f32(carry_max, 0), carry_max_scalar);
// Perform reduction of local min/max values
update_min_max(min_i, max_i);
}
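
The three minmax_* variants above share one structure: the window's X dimension is collapsed to a single step and the row is walked manually, with a vectorised body consuming full 16- (or 8-) element chunks and a scalar tail handling the leftover elements; the two partial results are merged before the locked update. A simplified scalar model of that split, assuming standard C++ only (the chunked inner loop stands in for the NEON body):

#include <algorithm>
#include <cstdint>
#include <utility>

// Illustrative only: same loop structure as minmax_U8, without intrinsics.
std::pair<uint8_t, uint8_t> min_max_row(const uint8_t *row, int x_start, int x_end)
{
    constexpr int step = 16;              // elements per "vector" iteration
    uint8_t carry_min = UINT8_MAX;        // vector-loop carries (modelled as scalars here)
    uint8_t carry_max = 0;
    uint8_t carry_min_scalar = UINT8_MAX; // tail-loop carries
    uint8_t carry_max_scalar = 0;

    int x = x_start;
    for(; x <= x_end - step; x += step)   // full chunks
    {
        for(int i = 0; i < step; ++i)
        {
            carry_min = std::min(carry_min, row[x + i]);
            carry_max = std::max(carry_max, row[x + i]);
        }
    }
    for(; x < x_end; ++x)                 // leftover elements
    {
        carry_min_scalar = std::min(carry_min_scalar, row[x]);
        carry_max_scalar = std::max(carry_max_scalar, row[x]);
    }
    return { std::min(carry_min, carry_min_scalar), std::max(carry_max, carry_max_scalar) };
}
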
NEMinMaxLocationKernel::NEMinMaxLocationKernel()
- : _func(nullptr), _input(nullptr), _min(nullptr), _max(nullptr), _min_count(nullptr), _max_count(nullptr), _min_loc(nullptr), _max_loc(nullptr), _num_elems_processed_per_iteration(0)
+ : _func(nullptr), _input(nullptr), _min(nullptr), _max(nullptr), _min_count(nullptr), _max_count(nullptr), _min_loc(nullptr), _max_loc(nullptr)
{
}
@@ -222,12 +351,12 @@
&NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
};
-void NEMinMaxLocationKernel::configure(const IImage *input, int32_t *min, int32_t *max,
+void NEMinMaxLocationKernel::configure(const IImage *input, void *min, void *max,
ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc,
uint32_t *min_count, uint32_t *max_count)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8, Format::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON(nullptr == min);
ARM_COMPUTE_ERROR_ON(nullptr == max);
@@ -246,31 +375,35 @@
unsigned int table_idx = (count_min << 3) | (count_max << 2) | (loc_min << 1) | loc_max;
- switch(input->info()->format())
+ switch(input->info()->data_type())
{
- case Format::U8:
+ case DataType::U8:
_func = create_func_table<uint8_t, gen_index_seq<16>::type>::func_table[table_idx];
break;
- case Format::S16:
+ case DataType::S16:
_func = create_func_table<int16_t, gen_index_seq<16>::type>::func_table[table_idx];
break;
+ case DataType::F32:
+ _func = create_func_table<float, gen_index_seq<16>::type>::func_table[table_idx];
+ break;
default:
- ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ ARM_COMPUTE_ERROR("Unsupported data type");
break;
}
- _num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, _num_elems_processed_per_iteration));
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
INEKernel::configure(win);
}
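
NEMinMaxLocationKernel::configure packs its four boolean options into a 4-bit table_idx and uses it to index a table of sixteen template instantiations generated from an index sequence. A reduced standalone sketch of that dispatch mechanism, using two flags instead of four (worker and make_table are illustrative names):

#include <array>
#include <cstdio>
#include <utility>

// Illustrative only: one instantiation per flag combination, selected by a packed index.
template <bool count, bool loc>
void worker()
{
    std::printf("count=%d loc=%d\n", count, loc);
}

template <std::size_t... N>
constexpr std::array<void (*)(), sizeof...(N)> make_table(std::index_sequence<N...>)
{
    return { { &worker<bool(N & 2), bool(N & 1)>... } };
}

int main()
{
    constexpr auto table = make_table(std::make_index_sequence<4>{});
    const bool count = true;
    const bool loc   = false;
    const unsigned idx = (count << 1) | loc; // same packing idea as table_idx in the kernel
    table[idx]();                            // dispatches to worker<true, false>
}
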
-void NEMinMaxLocationKernel::run(const Window &window)
+void NEMinMaxLocationKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -285,9 +418,8 @@
{
Iterator input(_input, win);
- size_t min_count = 0;
- size_t max_count = 0;
- unsigned int step = _num_elems_processed_per_iteration;
+ size_t min_count = 0;
+ size_t max_count = 0;
// Clear min location array
if(loc_min)
@@ -301,46 +433,48 @@
_max_loc->clear();
}
+ using type = typename std::conditional<std::is_same<T, float>::value, float, int32_t>::type;
+
+ auto min_ptr = static_cast<type *>(_min);
+ auto max_ptr = static_cast<type *>(_max);
+
execute_window_loop(win, [&](const Coordinates & id)
{
auto in_ptr = reinterpret_cast<const T *>(input.ptr());
int32_t idx = id.x();
int32_t idy = id.y();
- for(unsigned int i = 0; i < step; ++i)
+ const T pixel = *in_ptr;
+ Coordinates2D p{ idx, idy };
+
+ if(count_min || loc_min)
{
- const T pixel = *in_ptr++;
- Coordinates2D p{ idx++, idy };
-
- if(count_min || loc_min)
+ if(*min_ptr == pixel)
{
- if(*_min == pixel)
+ if(count_min)
{
- if(count_min)
- {
- ++min_count;
- }
+ ++min_count;
+ }
- if(loc_min)
- {
- _min_loc->push_back(p);
- }
+ if(loc_min)
+ {
+ _min_loc->push_back(p);
}
}
+ }
- if(count_max || loc_max)
+ if(count_max || loc_max)
+ {
+ if(*max_ptr == pixel)
{
- if(*_max == pixel)
+ if(count_max)
{
- if(count_max)
- {
- ++max_count;
- }
+ ++max_count;
+ }
- if(loc_max)
- {
- _max_loc->push_back(p);
- }
+ if(loc_max)
+ {
+ _max_loc->push_back(p);
}
}
}
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index 03d1409..ba68de6 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -930,8 +930,9 @@
input, output);
}
-void NENonLinearFilterKernel::run(const Window &window)
+void NENonLinearFilterKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
index 1826c47..b7dfb59 100644
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
@@ -224,7 +224,7 @@
INEKernel::configure(win);
}
-#endif
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
namespace
{
@@ -495,8 +495,9 @@
INEKernel::configure(win);
}
-void NENonMaximaSuppression3x3Kernel::run(const Window &window)
+void NENonMaximaSuppression3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index a971dc8..fc3f5f2 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -46,15 +46,20 @@
void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared, output);
ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
- ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
- ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
- ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
+ if(is_data_type_fixed_point(input->info()->data_type()))
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
+ }
const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min(norm_info.norm_size() / 2, 3U);
@@ -64,27 +69,101 @@
_norm_info = norm_info;
_border_size = BorderSize(0, border_width);
- const bool is_dt_f32 = _input->info()->data_type() == DataType::F32;
+ unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ ARM_COMPUTE_UNUSED(num_elems_processed_per_iteration);
- switch(norm_info.type())
+ switch(_input->info()->data_type())
{
- case NormType::IN_MAP_1D:
- _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, false> : &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+ case DataType::F32:
+ {
+ num_elems_processed_per_iteration = 4;
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
break;
- case NormType::IN_MAP_2D:
- // Normalize over X and Y
- _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, true> : &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+ }
+ case DataType::F16:
+ {
+ num_elems_processed_per_iteration = 8;
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
break;
- case NormType::CROSS_MAP:
- _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<2, false> : &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+ }
+ case DataType::QS8:
+ {
+ num_elems_processed_per_iteration = 16;
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
break;
+ }
+ case DataType::QS16:
+ {
+ num_elems_processed_per_iteration = 8;
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
default:
ARM_COMPUTE_ERROR("NOT SUPPORTED!");
}
- const unsigned int num_elems_processed_per_iteration = (is_dt_f32) ? 4 : 16;
- const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
- const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
+ const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+ const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
// Configure window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -100,8 +179,8 @@
INEKernel::configure(win);
}
-template <unsigned int dim, bool do_2D_norm>
-void NENormalizationLayerKernel::normalize(const Window &window)
+template <DataType dt, unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerKernel::normalize_float(const Window &window)
{
Iterator input(_input, window);
Iterator input_squared(_input_squared, window);
@@ -117,42 +196,86 @@
const int min_top = 0;
const int max_bottom = _input->info()->dimension(dim_y) - 1;
- const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
- const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
- const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
-
- execute_window_loop(window, [&](const Coordinates & id)
+ if(dt == DataType::F32)
{
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- const int first_slice = std::max(current_slice - radius, min_left);
- const int last_slice = std::min(current_slice + radius, max_right);
+ const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
+ const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
+ const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
- // Accumulate 2D In-Map values
- float32x4_t accu = vdupq_n_f32(0.f);
- for(int j = first_row; j <= last_row; j++)
+ execute_window_loop(window, [&](const Coordinates & id)
{
- // Compute row displacement
- const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
- const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
- for(int i = first_slice; i <= last_slice; ++i)
- {
- accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
- }
- }
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
- // Normalize
- const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
- const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
- vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
- },
- input, input_squared, output);
+ // Accumulate 2D In-Map values
+ float32x4_t accu = vdupq_n_f32(0.f);
+ for(int j = first_row; j <= last_row; j++)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
+ const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ else if(dt == DataType::F16)
+ {
+ const float16x8_t coeff_vec = vdupq_n_f16(_norm_info.scale_coeff());
+ const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta());
+ const float16x8_t kappa_vec = vdupq_n_f16(_norm_info.kappa());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ float16x8_t accu = vdupq_n_f16(0.f);
+ for(int j = first_row; j <= last_row; j++)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ const float16x8_t norm_f16 = vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16);
+ const float16x8_t normalized_pixel = vmulq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16));
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
-template <unsigned int dim, bool do_2D_norm>
+template <DataType dt, unsigned int dim, bool do_2D_norm>
void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
{
Iterator input(_input, window);
@@ -171,44 +294,89 @@
const int fixed_point_position = _input->info()->fixed_point_position();
- const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
- const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
- const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
-
- execute_window_loop(window, [&](const Coordinates & id)
+ if(dt == DataType::QS8)
{
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- const int first_slice = std::max(current_slice - radius, min_left);
- const int last_slice = std::min(current_slice + radius, max_right);
+ const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
+ const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
+ const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
- // Accumulate 2D In-Map values
- qint8x16_t accu = vdupq_n_qs8(0);
- for(int j = first_row; j <= last_row; ++j)
+ execute_window_loop(window, [&](const Coordinates & id)
{
- // Compute row displacement
- const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
- const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
- for(int i = first_slice; i <= last_slice; ++i)
- {
- accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
- }
- }
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
- // Normalize
- const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
- const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
- const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
- },
- input, input_squared, output);
+ // Accumulate 2D In-Map values
+ qint8x16_t accu = vdupq_n_qs8(0);
+ for(int j = first_row; j <= last_row; ++j)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
+ const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
+ const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+ else if(dt == DataType::QS16)
+ {
+ const qint16x8_t coeff_vec = vdupq_n_qs16_f32(_norm_info.scale_coeff(), fixed_point_position);
+ const qint16x8_t beta_vec = vdupq_n_qs16_f32(_norm_info.beta(), fixed_point_position);
+ const qint16x8_t kappa_vec = vdupq_n_qs16_f32(_norm_info.kappa(), fixed_point_position);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ qint16x8_t accu = vdupq_n_qs16(0);
+ for(int j = first_row; j <= last_row; ++j)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vqaddq_qs16(accu, vld1q_qs16(reinterpret_cast<const qint16_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const qint16x8_t accu_scale = vqmlaq_qs16(kappa_vec, coeff_vec, accu, fixed_point_position);
+ const qint16x8_t normalized = vqpowq_qs16(accu_scale, beta_vec, fixed_point_position);
+ const qint16x8_t normalized_pixel = vdivq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), normalized, fixed_point_position);
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
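
Both normalize_float and normalize_fixed_point evaluate the same expression, out = in / (kappa + scale_coeff * sum(in^2))^beta, over a neighbourhood of radius norm_size / 2 clamped to the tensor bounds; only the arithmetic (F32, F16 or Q-format) differs. A scalar reference for the 1D in-map case, handy for checking the vectorised paths (standard C++ only, not arm_compute API):

#include <algorithm>
#include <cmath>
#include <vector>

// Illustrative scalar model of IN_MAP_1D normalisation on a single row.
std::vector<float> normalize_in_map_1d(const std::vector<float> &in,
                                       int norm_size, float kappa, float beta, float scale_coeff)
{
    const int radius = norm_size / 2;
    const int width  = static_cast<int>(in.size());
    std::vector<float> out(in.size());
    for(int x = 0; x < width; ++x)
    {
        const int first = std::max(x - radius, 0);
        const int last  = std::min(x + radius, width - 1);
        float accu = 0.f;
        for(int i = first; i <= last; ++i)
        {
            accu += in[i] * in[i]; // the kernel reads this from the pre-squared input tensor
        }
        out[x] = in[x] / std::pow(kappa + scale_coeff * accu, beta);
    }
    return out;
}
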
-void NENormalizationLayerKernel::run(const Window &window)
+void NENormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index aa8c7a1..19d45e2 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -38,6 +38,10 @@
#include <cstdint>
#include <cstdlib>
+#if ARM_COMPUTE_ENABLE_FP16
+#include <arm_fp16.h> // needed for float16_t
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
using namespace arm_compute;
namespace arm_compute
@@ -127,20 +131,100 @@
template <bool is_scale255, bool is_sat>
void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
{
- // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0.
- ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 8-bit fixed-point pixel-wise multiplication");
- ARM_COMPUTE_UNUSED(n);
-
- const auto input1 = static_cast<const qint8_t *__restrict>(input1_ptr);
- const auto input2 = static_cast<const qint8_t *__restrict>(input2_ptr);
const auto output = static_cast<qint8_t *__restrict>(output_ptr);
- const qint8x16_t ta1 = vld1q_qs8(input1);
- const qint8x16_t ta2 = vld1q_qs8(input2);
+ const qint8x16_t ta1 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input1_ptr));
+ const qint8x16_t ta2 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input2_ptr));
- qint8x16_t res = (is_sat) ? vqmulq_qs8(ta1, ta2, fixed_point_position) : vmulq_qs8(ta1, ta2, fixed_point_position);
+ if(is_scale255)
+ {
+ qint16x8_t tmp1_high = vmovl_s8(vget_high_s8(ta1));
+ qint16x8_t tmp1_low = vmovl_s8(vget_low_s8(ta1));
+ const qint16x8_t tmp2_high = vmovl_s8(vget_high_s8(ta2));
+ const qint16x8_t tmp2_low = vmovl_s8(vget_low_s8(ta2));
- vst1q_s8(output, res);
+ const float32x4x2_t scale255_f32 =
+ {
+ {
+ scale255_constant_f32q,
+ scale255_constant_f32q
+ }
+ };
+ const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
+
+ tmp1_high = vmulq_qs16(tmp1_high, tmp2_high, fixed_point_position);
+ tmp1_low = vmulq_qs16(tmp1_low, tmp2_low, fixed_point_position);
+ tmp1_high = vmulq_qs16(tmp1_high, scale255, fixed_point_position);
+ tmp1_low = vmulq_qs16(tmp1_low, scale255, fixed_point_position);
+
+ if(is_sat)
+ {
+ vst1q_qs8(output, vcombine_s8(vqmovn_s16(tmp1_low), vqmovn_s16(tmp1_high)));
+ }
+ else
+ {
+ vst1q_qs8(output, vcombine_s8(vmovn_s16(tmp1_low), vmovn_s16(tmp1_high)));
+ }
+ }
+ else
+ {
+ const qint8x16_t vn = vdupq_n_s8(-n);
+ qint8x16_t res = ta2;
+
+ if(is_sat)
+ {
+ res = vqshlq_s8(vqmulq_qs8(ta1, res, fixed_point_position), vn);
+ }
+ else
+ {
+ res = vshlq_s8(vmulq_qs8(ta1, res, fixed_point_position), vn);
+ }
+ vst1q_qs8(output, res);
+ }
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_QS16_QS16_QS16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
+{
+ const qint16x8x2_t ta1 = vld2q_qs16(static_cast<const qint16_t *__restrict>(input1_ptr));
+ qint16x8x2_t res = vld2q_qs16(static_cast<const qint16_t *__restrict>(input2_ptr));
+
+ if(is_scale255)
+ {
+ const float32x4x2_t scale255_f32 =
+ {
+ {
+ scale255_constant_f32q,
+ scale255_constant_f32q
+ }
+ };
+ const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
+ if(is_sat)
+ {
+ res.val[0] = vqmulq_qs16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
+ res.val[1] = vqmulq_qs16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
+ }
+ else
+ {
+ res.val[0] = vmulq_qs16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
+ res.val[1] = vmulq_qs16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
+ }
+ }
+ else
+ {
+ const qint16x8_t vn = vdupq_n_s16(-n);
+ if(is_sat)
+ {
+ res.val[0] = vqshlq_s16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
+ res.val[1] = vqshlq_s16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
+ }
+ else
+ {
+ res.val[0] = vshlq_s16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
+ res.val[1] = vshlq_s16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
+ }
+ }
+ vst2q_s16(static_cast<qint16_t *__restrict>(output_ptr), res);
}
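
In the non-scale255 branch the factor scale = 1 / 2^n is folded into a shift by -n (saturating when is_sat) applied after the Q-format multiply. A scalar QS8 model of that path, assuming arithmetic right shifts and ignoring the rounding performed by the intrinsics (illustrative only):

#include <algorithm>
#include <cstdint>

// Illustrative only: scalar QS8 multiply with scale = 1 / 2^n, saturated to [-128, 127].
// Truncating variant; the NEON path additionally rounds inside vqmulq_qs8.
int8_t mul_qs8_scalar(int8_t a, int8_t b, int n, int fixed_point_position)
{
    // Q-format multiply: widen, multiply, shift back by the fractional-bit count.
    int32_t product = (static_cast<int32_t>(a) * static_cast<int32_t>(b)) >> fixed_point_position;
    product >>= n; // scale = 1 / 2^n, mirroring vqshlq_s8(..., vdupq_n_s8(-n))
    return static_cast<int8_t>(std::min<int32_t>(std::max<int32_t>(product, INT8_MIN), INT8_MAX));
}
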
template <bool is_scale255, bool is_sat>
@@ -249,6 +333,33 @@
}
template <bool is_scale255, bool is_sat>
+void mul_F16_F16_F16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ const auto input1 = static_cast<const float16_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const float16_t *__restrict>(input2_ptr);
+ const auto output = static_cast<float16_t *__restrict>(output_ptr);
+ const float16x8x2_t ta1 = vld2q_f16(input1);
+ const float16x8x2_t ta2 = vld2q_f16(input2);
+ const float16x8_t scale_vec = vdupq_n_f16(scale);
+ const float16x8x2_t result =
+ {
+ {
+ vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
+ vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
+ }
+ };
+ vst2q_f16(output, result);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(input1_ptr);
+ ARM_COMPUTE_UNUSED(input2_ptr);
+ ARM_COMPUTE_UNUSED(output_ptr);
+ ARM_COMPUTE_UNUSED(scale);
+ ARM_COMPUTE_ERROR("Not supported. Recompile the library with arch=arm64-v8.2-a.");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
{
const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
@@ -333,16 +444,43 @@
void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+ else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
+ {
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+ else if(input1->info()->data_type() == DataType::QS8 && input2->info()->data_type() == DataType::QS8)
+ {
+ set_data_type_if_unknown(*output->info(), DataType::QS8);
+ set_fixed_point_position_if_zero(*output->info(), input1->info()->fixed_point_position());
+ }
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
- if(output->info()->data_type() == DataType::QS8 || input1->info()->data_type() == DataType::QS8 || output->info()->data_type() == DataType::QS8)
+ if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
{
- // All data types must be QS8
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output);
+ // Check that all data types are the same and all fixed-point positions are the same
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+ // Check if scale is representable in fixed-point with the provided settings
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(scale, input1);
}
_input1 = input1;
@@ -457,6 +595,22 @@
_func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<false, true> : &mul_QS8_QS8_QS8_n<false, false>;
}
}
+ else if(DataType::QS16 == dt_input1 && DataType::QS16 == dt_input2 && DataType::QS16 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n<true, true> : &mul_QS16_QS16_QS16_n<true, false>;
+ }
+ else
+ {
+ _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n<false, true> : &mul_QS16_QS16_QS16_n<false, false>;
+ }
+ }
+ else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output)
+ {
+ _func_float = &mul_F16_F16_F16_n<false, false>;
+ _func_int = nullptr;
+ }
else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
{
_func_float = &mul_F32_F32_F32_n<false, false>;
@@ -486,8 +640,9 @@
INEKernel::configure(win);
}
-void NEPixelWiseMultiplicationKernel::run(const Window &window)
+void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 30b67b6..b97564e 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
@@ -36,7 +37,9 @@
#include <algorithm>
#include <arm_neon.h>
+#include <cmath>
#include <limits>
+#include <set>
#include <string>
#include <tuple>
@@ -47,24 +50,37 @@
inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
- int start_x = id.x() * stride_x - pad_x;
- int start_y = id.y() * stride_y - pad_y;
- int end_x = std::min(start_x + pool_size, upper_bound_w);
- int end_y = std::min(start_y + pool_size, upper_bound_h);
+ const int start_x = id.x() * stride_x - pad_x;
+ const int start_y = id.y() * stride_y - pad_y;
+ const int end_x = std::min(start_x + pool_size, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
return 1.f / ((end_y - start_y) * (end_x - start_x));
}
inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
{
- static std::array<qint8_t, 10> scale_values_q8 =
+ static const std::array<qint8_t, 10> scale_values_q8 =
{ { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
const int start_x = id.x() * stride_x - pad_x;
const int start_y = id.y() * stride_y - pad_y;
const int end_x = std::min(start_x + pool_size, upper_bound_w);
const int end_y = std::min(start_y + pool_size, upper_bound_h);
const int val = ((end_y - start_y) * (end_x - start_x));
- return scale_values_q8[val] >> (7 - fixed_point_position);
+ return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
+}
+
+inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
+ int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
+{
+    static const std::array<qint16_t, 10> scale_values_q16 =
+ { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
+ const int start_x = id.x() * stride_x - pad_x;
+ const int start_y = id.y() * stride_y - pad_y;
+ const int end_x = std::min(start_x + pool_size, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
+ const int val = ((end_y - start_y) * (end_x - start_x));
+ return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
}
} // namespace
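
The scale_values_q8 / scale_values_q16 tables store 1 / area with 7 and 15 fractional bits respectively (for example 0x2AAB ≈ 1/3 and 0x199A ≈ 1/5 in Q1.15), and sshr_qs8 / sshr_qs16 then rescale the constant to the tensor's actual fixed_point_position. A small standalone check of how the Q1.15 constants are derived (standard C++ only):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative only: derive the 1/area constants used by the average-pooling scale tables.
int16_t q15_reciprocal(int area)
{
    return static_cast<int16_t>(std::lround((1 << 15) / static_cast<double>(area)));
}

int main()
{
    for(int area = 2; area <= 9; ++area)
    {
        // e.g. area 3 -> 0x2AAB, area 5 -> 0x199A, as in scale_values_q16.
        std::printf("1/%d in Q1.15 = 0x%04X\n", area, static_cast<unsigned>(q15_reciprocal(area)));
    }
    // Converting to a tensor with fixed_point_position fp is then a right shift by (15 - fp).
}
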
@@ -80,34 +96,44 @@
void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
{
- int pool_pad_x = 0;
- int pool_pad_y = 0;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- PoolingType pool_type = pool_info.pool_type();
- int pool_size = pool_info.pool_size();
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
- DimensionRoundingType pool_round = pad_stride_info.round();
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ PoolingType pool_type = pool_info.pool_type();
+ int pool_size = pool_info.pool_size();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
+ static const std::set<int> supported_pool_sizes = { 2, 3, 7 };
+ ARM_COMPUTE_UNUSED(supported_pool_sizes);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type()));
+ ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
+ ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
- ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_type == PoolingType::AVG && input->info()->fixed_point_position() > 6);
- ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_stride_x > 2);
+ ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2);
// Check output dimensions
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
- pool_size, pool_stride_x, pool_stride_y,
- pool_pad_x, pool_pad_y, pool_round);
- ARM_COMPUTE_UNUSED(pooled_w);
- ARM_COMPUTE_UNUSED(pooled_h);
+ pool_size, pool_size, pool_info.pad_stride_info());
+
+ // Output auto initialization if not yet initialized
+ {
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(0, pooled_w);
+ output_shape.set(1, pooled_h);
+
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
unsigned int num_elems_read_per_iteration = 0;
@@ -118,12 +144,72 @@
switch(input->info()->data_type())
{
case DataType::QS8:
- num_elems_read_per_iteration = 16;
- num_elems_processed_per_iteration = (pool_size == 2) ? 8 : 7;
- num_elems_horizontal_window = 8;
+ num_elems_read_per_iteration = 16;
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
+ break;
+ case 3:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pooling size not supported");
+ break;
+ }
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
break;
+ case DataType::QS16:
+ num_elems_read_per_iteration = 8;
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
+ break;
+ case 3:
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pooling size not supported");
+ }
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = 8;
+ num_elems_horizontal_window = 8;
+ break;
+ case 3:
+ num_elems_read_per_iteration = 4;
+ num_elems_processed_per_iteration = 1;
+ num_elems_horizontal_window = 1;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pooling size not supported");
+ break;
+ }
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::F32:
- num_elems_read_per_iteration = (pool_size == 2) ? 2 : 4; // We use vload4 for pooling3
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_read_per_iteration = 2;
+ break;
+ case 3:
+ num_elems_read_per_iteration = 4; // We use vload4 for pooling3
+ break;
+ case 7:
+ num_elems_read_per_iteration = 8; // We use vload8 for pooling7
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pooling size not supported");
+ break;
+ }
num_elems_processed_per_iteration = 1;
num_elems_horizontal_window = 1;
break;
@@ -152,21 +238,145 @@
case 2:
if(input->info()->data_type() == DataType::QS8)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ else if(input->info()->data_type() == DataType::QS16)
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
else if(input->info()->data_type() == DataType::F32)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
break;
case 3:
if(input->info()->data_type() == DataType::QS8)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ else if(input->info()->data_type() == DataType::QS16)
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
else if(input->info()->data_type() == DataType::F32)
{
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ break;
+ case 7:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
}
break;
default:
@@ -207,7 +417,8 @@
{
const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
- qint8x8_t res = {};
+ qint8x8_t lower_res = {};
+ qint8x8_t upper_res = {};
if(pooling_type == PoolingType::AVG)
{
// Calculate scale
@@ -216,24 +427,175 @@
// Perform pooling
const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
- res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
+ lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
+ if(pool_stride_x == 1)
+ {
+ const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
+ upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
+ }
}
else
{
const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
- res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
+ lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
+ if(pool_stride_x == 1)
+ {
+ const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
+ upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
+ }
}
- vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ if(pool_stride_x == 1)
+ {
+ const qint8x8x2_t res = { { lower_res, upper_res } };
+ vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ }
+ else
+ {
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
+ }
},
input, output);
}
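
When pool_stride_x == 1 the pooling2_q8 path above produces two results per loaded vector: lower_res from the pairwise reduction of the data and upper_res from the same data shifted by one element (vextq), interleaved back with vst2 so outputs land in order. A scalar model of that even/odd split for 2-wide horizontal averaging along one row (illustrative only, vertical reduction omitted):

#include <cstddef>
#include <vector>

// Illustrative only: stride-1, 2-wide horizontal averaging computed as two
// interleaved streams, mirroring the vext + vst2 pattern of pooling2_q8/q16.
// Expects at least two elements in the row.
std::vector<float> horizontal_avg2_stride1(const std::vector<float> &row)
{
    std::vector<float> out(row.size() - 1);
    for(std::size_t x = 0; x + 1 < row.size(); x += 2)
    {
        out[x] = 0.5f * (row[x] + row[x + 1]);             // "lower_res" lane
        if(x + 2 < row.size())
        {
            out[x + 1] = 0.5f * (row[x + 1] + row[x + 2]); // "upper_res" lane (shifted input)
        }
    }
    return out;
}
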
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
+ qint16x4_t lower_res = {};
+ qint16x4_t upper_res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+ const qint16x4_t scale_vec = vdup_n_qs16(scale);
+
+ // Perform pooling
+ const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
+ lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
+ if(pool_stride_x == 1)
+ {
+ const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
+ upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
+ }
+ }
+ else
+ {
+ const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
+ lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
+ if(pool_stride_x == 1)
+ {
+ const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
+ upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
+ }
+ }
+ if(pool_stride_x == 1)
+ {
+ const qint16x4x2_t res = { { lower_res, upper_res } };
+ vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
+ }
+ else
+ {
+ vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
+ }
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr const int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+ float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
+ float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+ float16x4_t res = {};
+
+        // Square the input in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmul_f16(top_data, top_data);
+ middle_data = vmul_f16(middle_data, middle_data);
+ bottom_data = vmul_f16(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float16x4_t scale_v = vdup_n_f16(scale);
+ // Perform pooling
+ const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
+ res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
+ res = vmul_f16(vpadd_f16(res, res), scale_v);
+ }
+ else
+ {
+ const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
+ res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
+ res = vpmax_f16(res, res);
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ res = vinv_f16(vinvsqrt_f16(res));
+ }
+
+ *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
+ },
+ input, output);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
constexpr int pool_size = 2;
int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
@@ -246,10 +608,81 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- const float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
- const float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
- float32x2_t res = {};
- if(pooling_type == PoolingType::AVG)
+ auto top_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+ auto bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+ float16x8_t res = {};
+
+ // Square the data in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data.val[0] = vmulq_f16(top_data.val[0], top_data.val[0]);
+ top_data.val[1] = vmulq_f16(top_data.val[1], top_data.val[1]);
+ bottom_data.val[0] = vmulq_f16(bottom_data.val[0], bottom_data.val[0]);
+ bottom_data.val[1] = vmulq_f16(bottom_data.val[1], bottom_data.val[1]);
+ }
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float16x8_t scale_v = vdupq_n_f16(scale);
+ res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
+ }
+ else
+ {
+ res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ res = vinvq_f16(vinvsqrtq_f16(res));
+ }
+
+ // Store result
+ vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
+ },
+ input, output);
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+ ARM_COMPUTE_UNUSED(window_input);
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ float final_res = 0;
+
+ // Square the data in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmul_f32(top_data, top_data);
+ bottom_data = vmul_f32(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
{
// Calculate scale
float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
@@ -264,7 +697,16 @@
const float32x2_t max_data = vmax_f32(top_data, bottom_data);
res = vpmax_f32(max_data, max_data);
}
- *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
},
input, output);
}
@@ -299,8 +741,7 @@
if(pooling_type == PoolingType::AVG)
{
// Calculate scale
- const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
- const qint8x8_t scale_vec = vdup_n_qs8(scale);
+ const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
// Perform pooling for stride 2
const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
@@ -311,13 +752,16 @@
{
const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ const qint8x8_t scale_vec = vdup_n_qs8(scale);
res = vtbl2_s8(table, lookup_val);
+ res = vqmul_qs8(res, scale_vec, fixed_point_position);
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
}
else
{
- res = vget_low_s8(final_sum);
+ const qint8x16_t scale_vec = vdupq_n_qs8(scale);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
}
- res = vqmul_qs8(res, scale_vec, fixed_point_position);
}
else
{
@@ -331,25 +775,29 @@
const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
res = vtbl2_s8(table, lookup_val);
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
}
else
{
- res = vget_low_s8(final_max);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
}
}
- vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
},
input, output);
}
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
- constexpr const int pool_size = 3;
- int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
@@ -361,13 +809,92 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- const float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
- const float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
- const float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
- float32x2_t res = {};
+ const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
+ const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
+
if(pooling_type == PoolingType::AVG)
{
// Calculate scale
+ const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+
+ // Perform pooling for stride 2
+ const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
+ const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
+ const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
+ const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
+ if(pool_stride_x == 2)
+ {
+ const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
+ const qint16x4_t scale_vec = vdup_n_qs16(scale);
+ vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
+ }
+ else
+ {
+ const qint16x8_t scale_vec = vdupq_n_qs16(scale);
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
+ }
+ }
+ else
+ {
+ const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
+ const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
+ const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
+ const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
+
+ if(pool_stride_x == 2)
+ {
+ const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
+ vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
+ }
+ else
+ {
+ vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
+ }
+ }
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr const int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
+ float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ float final_res = 0;
+
+ // Square the data in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ top_data = vmulq_f32(top_data, top_data);
+ middle_data = vmulq_f32(middle_data, middle_data);
+ bottom_data = vmulq_f32(bottom_data, bottom_data);
+ }
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
@@ -382,30 +909,133 @@
res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
res = vpmax_f32(res, res);
}
- *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
},
input, output);
}
-void NEPoolingLayerKernel::run(const Window &window)
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr const int pool_size = 7;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ std::array<const uint8_t *, pool_size> input_ptrs{ {} };
+ for(int i = 0; i < pool_size; ++i)
+ {
+ input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + i));
+ }
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float32x2_t res = {};
+ float final_res = 0.f;
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+ // Square the data in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
+ for(int i = 1; i < pool_size; ++i)
+ {
+ data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+ // Square the data in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ sum_data = vaddq_f32(sum_data, data.val[0]);
+ sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+ }
+ res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
+ }
+ else
+ {
+ float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
+ for(int i = 1; i < pool_size; ++i)
+ {
+ const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
+ max_data = vmax2q_f32(max_data, data);
+ }
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
+ res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
+ res = vpmax_f32(res, res);
+ }
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = final_res;
+ },
+ input, output);
+}
+
+void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- unsigned int pool_stride_x, pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
+ const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
// Set step for the input in x and y direction
Window window_input(window);
unsigned int window_x_inc = 0;
- if(_input->info()->data_type() == DataType::QS8)
+ switch(_input->info()->data_type())
{
- window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
- }
- else
- {
- window_x_inc = pool_stride_x;
+ case DataType::QS8:
+ case DataType::QS16:
+ case DataType::F16:
+ {
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ break;
+ }
+ case DataType::F32:
+ {
+ window_x_inc = pool_stride_x;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
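
The pooling paths added above share one L2 flow: square the loaded elements, accumulate them, apply the average scale, and take the square root of the value that is written out. As a rough scalar reference of that flow (the exact window clamping performed by calculate_avg_scale is only approximated here, and the 3x3 window size is an assumption of the sketch):

#include <algorithm>
#include <cmath>

// Scalar sketch of one 3x3 L2-pooling output element. `in` is a row-major
// width x height plane; padding handling and the scale computed by
// calculate_avg_scale in the kernel are simplified assumptions here.
float l2_pool3x3(const float *in, int width, int height, int out_x, int out_y, int stride_x, int stride_y)
{
    float sum_sq = 0.f;
    int   count  = 0;
    for(int dy = 0; dy < 3; ++dy)
    {
        for(int dx = 0; dx < 3; ++dx)
        {
            const int x = out_x * stride_x + dx;
            const int y = out_y * stride_y + dy;
            if(x < width && y < height) // stay inside the valid input region
            {
                const float v = in[y * width + x];
                sum_sq += v * v; // squaring step done with vmulq_f32 above
                ++count;
            }
        }
    }
    // Average of the squares, then square root, mirroring the scale
    // multiplication followed by sqrt in the NEON path.
    return std::sqrt(sum_sq / static_cast<float>(std::max(count, 1)));
}
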
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
new file mode 100644
index 0000000..bff79f0
--- /dev/null
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+NEQuantizationLayerKernel::NEQuantizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _min_max(nullptr)
+{
+}
+
+void NEQuantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::U8, 0);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _output = output;
+ _min_max = min_max;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+
+ // Update window and padding
+ update_window_and_padding(win, input_access, output_access, min_max_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEQuantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window window_input_output(window);
+ window_input_output.collapse_if_possible(INEKernel::window(), 3);
+ window_input_output.set(3, Window::Dimension(0, 1, 1));
+
+ Window window_min_max;
+ window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
+ window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
+ window_min_max.collapse_if_possible(INEKernel::window(), 1);
+
+ Iterator input(_input, window_input_output);
+ Iterator output(_output, window_input_output);
+ Iterator min_max(_min_max, window_min_max);
+
+ execute_window_loop(window_min_max, [&](const Coordinates & id_batch)
+ {
+ // Get the min and max
+ float min = *(reinterpret_cast<const float *>(min_max.ptr()) + 0);
+ float max = *(reinterpret_cast<const float *>(min_max.ptr()) + 1);
+
+ // Saturate the result if min = max
+ if(min == max)
+ {
+ min = 0.0f;
+ max = 1.0f;
+ }
+
+ const float32x4_t vmin = vdupq_n_f32(min);
+ const float32x4_t inv_range = vdupq_n_f32(1.0f / (max - min));
+ const float32x4_t quantization_max = vdupq_n_f32(255.0f);
+ const float32x4_t quantization_mul = vdupq_n_f32(256.0f);
+
+ // Uniformly map values onto the 8-bit integer range, i.e. [min, max] -> [0, 255]
+ execute_window_loop(window_input_output, [&](const Coordinates & id)
+ {
+ // Get the input values
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
+ float32x4x2_t val = vld2q_f32(input_ptr);
+
+ // Map float values to range [0.0, 1.0]
+ val.val[0] = vsubq_f32(val.val[0], vmin);
+ val.val[1] = vsubq_f32(val.val[1], vmin);
+ val.val[0] = vmulq_f32(val.val[0], inv_range);
+ val.val[1] = vmulq_f32(val.val[1], inv_range);
+
+ // Quantize
+ val.val[0] = vmulq_f32(val.val[0], quantization_mul);
+ val.val[1] = vmulq_f32(val.val[1], quantization_mul);
+ val.val[0] = vminq_f32(val.val[0], quantization_max);
+ val.val[1] = vminq_f32(val.val[1], quantization_max);
+
+ const uint32x4_t val_u32_low = vcvtq_u32_f32(val.val[0]);
+ const uint32x4_t val_u32_high = vcvtq_u32_f32(val.val[1]);
+ const uint16x4x2_t val_u16 = vzip_u16(vmovn_u32(val_u32_low), vmovn_u32(val_u32_high));
+
+ const uint8x8_t quantized = vmovn_u16(vcombine_u16(val_u16.val[0], val_u16.val[1]));
+
+ // Store the quantized values
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr() + id_batch[1] * _output->info()->strides_in_bytes()[3]);
+ vst1_u8(output_ptr, quantized);
+ },
+ input, output);
+ },
+ min_max);
+}
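
For reference, the per-element mapping that the vectorized loop above applies is small enough to state in scalar form. This sketch mirrors the visible arithmetic (subtract min, scale by 1/(max - min), multiply by 256, clamp to 255, truncate); the lower clamp is added only to keep the standalone function well defined:

#include <algorithm>
#include <cstdint>

// Scalar sketch of the quantization applied per element above:
// [min, max] is mapped uniformly onto [0, 255].
uint8_t quantize(float x, float min, float max)
{
    if(min == max) // degenerate range, same fallback as the kernel
    {
        min = 0.0f;
        max = 1.0f;
    }
    float q = (x - min) * (1.0f / (max - min)) * 256.0f;
    q = std::min(q, 255.0f); // clamp to the 8-bit maximum, as vminq_f32 does
    q = std::max(q, 0.0f);   // lower clamp added for this sketch only
    return static_cast<uint8_t>(q); // truncating conversion, like vcvtq_u32_f32
}
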
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
new file mode 100644
index 0000000..a209a52
--- /dev/null
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cfloat>
+#include <cmath>
+
+using namespace arm_compute;
+
+NEROIPoolingLayerKernel::NEROIPoolingLayerKernel()
+ : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
+{
+}
+
+void NEROIPoolingLayerKernel::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, rois, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+ ARM_COMPUTE_ERROR_ON(rois->num_values() == 0);
+
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+
+ // Set instance variables
+ _input = input;
+ _rois = rois;
+ _output = output;
+ _pool_info = pool_info;
+
+ // Configure kernel window
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, rois->num_values()));
+ window.set(Window::DimY, Window::Dimension(0, 1));
+
+ AccessWindowStatic input_access(input->info(),
+ input->info()->valid_region().start(0),
+ input->info()->valid_region().start(1),
+ input->info()->valid_region().end(0),
+ input->info()->valid_region().end(1));
+ AccessWindowStatic output_access(output->info(), 0, 0, pool_info.pooled_width(), pool_info.pooled_height());
+
+ update_window_and_padding(window, input_access, output_access);
+ output_access.set_valid_region(window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ INEKernel::configure(window);
+}
+
+void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int roi_list_start = window.x().start();
+ const int roi_list_end = window.x().end();
+ const int width = _input->info()->dimension(Window::DimX);
+ const int height = _input->info()->dimension(Window::DimY);
+ const int fms = _input->info()->dimension(Window::DimZ);
+ const int pooled_w = _pool_info.pooled_width();
+ const int pooled_h = _pool_info.pooled_height();
+ const float spatial_scale = _pool_info.spatial_scale();
+
+ for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+ {
+ const ROI &curr_roi = _rois->at(roi_indx);
+
+ // Scale ROI
+ const int roi_batch = curr_roi.batch_idx;
+ const int roi_anchor_x = support::cpp11::round(curr_roi.rect.x * spatial_scale);
+ const int roi_anchor_y = support::cpp11::round(curr_roi.rect.y * spatial_scale);
+ const int roi_width = std::max(support::cpp11::round(curr_roi.rect.width * spatial_scale), 1.f);
+ const int roi_height = std::max(support::cpp11::round(curr_roi.rect.height * spatial_scale), 1.f);
+
+ // Iterate through all feature maps
+ for(int fm = 0; fm < fms; ++fm)
+ {
+ // Iterate through all output pixels
+ for(int py = 0; py < pooled_h; ++py)
+ {
+ for(int px = 0; px < pooled_w; ++px)
+ {
+ auto region_start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
+ auto region_end_x = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
+ auto region_start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
+ auto region_end_y = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
+
+ region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width);
+ region_end_x = std::min(std::max(region_end_x + roi_anchor_x, 0), width);
+ region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height);
+ region_end_y = std::min(std::max(region_end_y + roi_anchor_y, 0), height);
+
+ // Iterate through the pooling region
+ if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ {
+ *reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0;
+ }
+ else
+ {
+ float curr_max = -FLT_MAX;
+ for(int j = region_start_y; j < region_end_y; ++j)
+ {
+ for(int i = region_start_x; i < region_end_x; ++i)
+ {
+ const auto val = *reinterpret_cast<const float *>(_input->ptr_to_element(Coordinates(i, j, fm, roi_batch)));
+ curr_max = std::max(val, curr_max);
+ }
+ }
+ *reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = curr_max;
+ }
+ }
+ }
+ }
+ }
+}
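
The inner loops above reduce to a small piece of index arithmetic: each pooled cell (px, py) selects a sub-rectangle of the scaled ROI, shifted by the ROI anchor and clamped to the input bounds. A condensed restatement of just that computation (the Region struct is only for illustration):

#include <algorithm>
#include <cmath>

// Region of the input pooled into output cell (px, py); mirrors the
// boundary computation in the loop above.
struct Region
{
    int start_x, end_x, start_y, end_y;
};

Region roi_region(int px, int py, int pooled_w, int pooled_h,
                  int roi_anchor_x, int roi_anchor_y, int roi_width, int roi_height,
                  int in_width, int in_height)
{
    Region r{};
    r.start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
    r.end_x   = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
    r.start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
    r.end_y   = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));

    // Shift by the ROI anchor and clamp to the input plane.
    r.start_x = std::min(std::max(r.start_x + roi_anchor_x, 0), in_width);
    r.end_x   = std::min(std::max(r.end_x + roi_anchor_x, 0), in_width);
    r.start_y = std::min(std::max(r.start_y + roi_anchor_y, 0), in_height);
    r.end_y   = std::min(std::max(r.end_y + roi_anchor_y, 0), in_height);
    return r; // an empty region (end <= start) is written out as 0 by the kernel
}
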
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
new file mode 100644
index 0000000..1a50ed8
--- /dev/null
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+template <class F>
+class Reducer
+{
+public:
+ static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f)
+ {
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_1D();
+ Window out_slice = out_window.first_slice_window_1D();
+
+ do
+ {
+ Iterator in(input, in_slice);
+ Iterator out(output, out_slice);
+
+ f(in, out, in_slice, out_slice);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+ }
+};
+
+struct SumsqOpX
+{
+ inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice)
+ {
+ ARM_COMPUTE_UNUSED(out_slice);
+ float32x4_t vec_sum_value = vdupq_n_f32(0.f);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const float32x4_t vec_elements = vld1q_f32(in_ptr);
+ vec_sum_value = vaddq_f32(vmulq_f32(vec_elements, vec_elements), vec_sum_value);
+ },
+ input);
+
+ float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
+ carry_addition = vpadd_f32(carry_addition, carry_addition);
+
+ *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(carry_addition, 0);
+ }
+};
+
+void reduce_sumsq(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+{
+ switch(axis)
+ {
+ case 0:
+ return Reducer<SumsqOpX>::reduceX(window, input, output, SumsqOpX());
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ }
+}
+} // namespace
+
+NEReductionOperationKernel::NEReductionOperationKernel()
+ : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE), _border_size()
+{
+}
+
+BorderSize NEReductionOperationKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis; only axis 0 is supported");
+
+ // Calculate output shape and set if empty
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(axis, 1);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
+
+ _input = input;
+ _output = output;
+ _border_size = (axis == 0) ? BorderSize(0, num_elems_processed_per_iteration - (input->info()->dimension(0) % num_elems_processed_per_iteration), 0, 0) : BorderSize();
+ _op = op;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_op)
+ {
+ case ReductionOperation::SUM_SQUARE:
+ reduce_sumsq(window, _input, _output, _reduction_axis);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction operation.");
+ }
+}
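
SumsqOpX above accumulates four squared lanes per iteration and folds them with vpadd_f32; the scalar effect is simply a sum of squares along the X axis. A plain reference of that reduction, with the row-major layout an assumption of the sketch:

#include <cstddef>
#include <vector>

// Scalar equivalent of the SUM_SQUARE reduction along X: each row of
// length `width` collapses to a single sum of squared elements.
std::vector<float> reduce_sumsq_x(const std::vector<float> &in, std::size_t width, std::size_t rows)
{
    std::vector<float> out(rows, 0.f);
    for(std::size_t r = 0; r < rows; ++r)
    {
        float sum = 0.f;
        for(std::size_t x = 0; x < width; ++x)
        {
            const float v = in[r * width + x];
            sum += v * v; // per-lane vmulq_f32 + vaddq_f32 in the kernel
        }
        out[r] = sum; // the kernel folds the four lanes with vpadd_f32
    }
    return out;
}
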
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index c3c44a5..83004ae 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -192,32 +192,33 @@
const uint8_t *in_ptr = in.ptr();
uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
},
in, out, mapx, mapy);
}
-void NERemapKernel::run(const Window &window)
+void NERemapKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
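
The rename above switches the remap kernel to pixel_bilinear_c1_clamp, which is defined in the library's helper headers and not shown in this patch. As a rough illustration only of what a clamped single-channel bilinear fetch computes (rounding and edge handling in the real helper may differ):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative clamped bilinear sample of a single-channel 8-bit image.
uint8_t bilinear_clamp(const uint8_t *img, int stride, int width, int height, float x, float y)
{
    // Clamp the sample position to the image.
    x = std::min(std::max(x, 0.f), static_cast<float>(width - 1));
    y = std::min(std::max(y, 0.f), static_cast<float>(height - 1));

    const int   x0 = static_cast<int>(std::floor(x));
    const int   y0 = static_cast<int>(std::floor(y));
    const int   x1 = std::min(x0 + 1, width - 1);
    const int   y1 = std::min(y0 + 1, height - 1);
    const float dx = x - x0;
    const float dy = y - y0;

    // Blend the four neighbours.
    const float top    = img[y0 * stride + x0] * (1.f - dx) + img[y0 * stride + x1] * dx;
    const float bottom = img[y1 * stride + x0] * (1.f - dx) + img[y1 * stride + x1] * dx;
    return static_cast<uint8_t>(top * (1.f - dy) + bottom * dy + 0.5f);
}
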
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
new file mode 100644
index 0000000..8e69252
--- /dev/null
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+template <typename T>
+inline void reshape_tensor(const Window &window, const ITensor *input, ITensor *output)
+{
+ const TensorShape &input_shape = input->info()->tensor_shape();
+ const TensorShape &output_shape = output->info()->tensor_shape();
+ Coordinates output_coord{};
+
+ window.collapse_if_possible(window, 3);
+ Iterator in(input, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ output_coord = index2coords(output_shape, coords2index(input_shape, id));
+ *reinterpret_cast<T *>(output->ptr_to_element(output_coord)) = *reinterpret_cast<T *>(in.ptr());
+ },
+ in);
+}
+} // namespace
+
+void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output->info(), 0, 0, output->info()->tensor_shape().x(), output->info()->tensor_shape().y());
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEReshapeLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::U8:
+ case DataType::S8:
+ case DataType::QS8:
+ reshape_tensor<uint8_t>(window, _input, _output);
+ break;
+ case DataType::U16:
+ case DataType::S16:
+ case DataType::QS16:
+ case DataType::F16:
+ reshape_tensor<uint16_t>(window, _input, _output);
+ break;
+ case DataType::U32:
+ case DataType::S32:
+ case DataType::F32:
+ reshape_tensor<uint32_t>(window, _input, _output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
+}
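
reshape_tensor above relies on the coords2index / index2coords helpers: an element's linear offset is computed in the input shape and re-expanded in the output shape, with dimension 0 (x) the fastest-moving one. The library helpers live elsewhere; this sketch shows only the arithmetic assumed here:

#include <cstddef>
#include <vector>

// Linearize coordinates with dimension 0 contiguous...
std::size_t coords_to_index(const std::vector<std::size_t> &shape, const std::vector<std::size_t> &coords)
{
    std::size_t index  = 0;
    std::size_t stride = 1;
    for(std::size_t d = 0; d < shape.size(); ++d)
    {
        index += coords[d] * stride;
        stride *= shape[d];
    }
    return index;
}

// ...and expand a linear offset back into coordinates of another shape.
std::vector<std::size_t> index_to_coords(const std::vector<std::size_t> &shape, std::size_t index)
{
    std::vector<std::size_t> coords(shape.size(), 0);
    for(std::size_t d = 0; d < shape.size(); ++d)
    {
        coords[d] = index % shape[d];
        index /= shape[d];
    }
    return coords;
}
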
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index fd2978d..6634d4b 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -50,8 +50,10 @@
void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output == input);
if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
@@ -79,6 +81,16 @@
_dx = dx;
_dy = dy;
+ /* Compute the ratio between source width/height and destination width/height */
+ const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+ const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+
+ /* Area interpolation behaves as Nearest Neighbour in case of up-sampling */
+ if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ {
+ policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+ }
+
switch(policy)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
@@ -104,13 +116,18 @@
}
constexpr unsigned int num_elems_processed_per_iteration = 16;
- const int border_offset = (border_undefined) ? 0 : border_size().left;
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input->info(), -border_offset, -border_offset, input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
- AccessWindowHorizontal offsets_access(offsets->info(), 0, num_elems_processed_per_iteration);
+ const ValidRegion &input_valid_region = input->info()->valid_region();
+
+ // Reads can occur within the valid region of the input
+ AccessWindowStatic input_access(input->info(),
+ input_valid_region.anchor[0] - border_size().left, input_valid_region.anchor[1] - border_size().top,
+ input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
+ input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
+ AccessWindowHorizontal offsets_access(offsets == nullptr ? nullptr : offsets->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal dx_access(dx == nullptr ? nullptr : dx->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal dy_access(dy == nullptr ? nullptr : dy->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
@@ -122,8 +139,7 @@
dy_access,
output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
+ output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()), output->info()->tensor_shape(), policy, border_size(), border_undefined));
INEKernel::configure(win);
}
@@ -164,8 +180,8 @@
const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
const uint8_t *const in_ptr = in.ptr();
- const size_t in_yi = (id.y() + 0.5f) * hr;
- const size_t offset_row = in_yi * input_stride;
+ const int in_yi = std::floor((id.y() + 0.5f) * hr);
+ const int offset_row = in_yi * input_stride;
tmp = vsetq_lane_u8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0);
tmp = vsetq_lane_u8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1);
@@ -203,8 +219,8 @@
{
const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
- const size_t in_yi = (id.y() + 0.5f) * hr;
- const size_t offset_row = in_yi * input_stride;
+ const int in_yi = (id.y() + 0.5f) * hr;
+ const int offset_row = in_yi * input_stride;
tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
@@ -229,6 +245,50 @@
in, offsets, out);
break;
}
+ case DataType::F32:
+ {
+ float32x4x4_t tmp =
+ {
+ {
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0)
+ }
+ };
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+ const int in_yi = (id.y() + 0.5f) * hr;
+ const int offset_row = in_yi * input_stride;
+
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3);
+
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3);
+
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3);
+
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3);
+
+ vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+ },
+ in, offsets, out);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported");
break;
@@ -237,7 +297,7 @@
void NEScaleKernel::scale_bilinear(const Window &window)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
// Compute the ratio between source height and destination height
const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
@@ -264,41 +324,140 @@
Iterator dy(_dy, win_off);
/* Input image stride */
- const size_t in_stride = _input->info()->strides_in_bytes()[1];
+ const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
+ const size_t in_stride = in_stide_in_bytes / _input->info()->element_size();
- execute_window_loop(window, [&](const Coordinates & id)
+ switch(_input->info()->data_type())
{
- const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
- const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
- const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+ case DataType::U8:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
- const size_t in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
- const size_t offset_row = in_yi * in_stride;
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
- uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
- uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
- vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
- },
- in, offsets, dx, dy, out);
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+ case DataType::S16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
+
+ int16x8x2_t tmp =
+ {
+ {
+ vdupq_n_s16(0),
+ vdupq_n_s16(0)
+ }
+ };
+
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
+
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
+
+ vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
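The even/odd lane split in the S16 case is deliberate: vst2q_s16 interleaves its two registers on store, so filling val[0] with the even offsets and val[1] with the odd ones puts the 16 results back into sequential output order (the F32 case below does the same with a 4-way split and vst4q_f32). A minimal sketch of that store behaviour, assuming a plain int16_t destination:

#include <arm_neon.h>
#include <cstdint>

// Writes even[0], odd[0], even[1], odd[1], ..., even[7], odd[7] to dst.
inline void store_16_interleaved(int16_t *dst, int16x8_t even, int16x8_t odd)
{
    const int16x8x2_t pair = { { even, odd } };
    vst2q_s16(dst, pair);
}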
+ case DataType::F32:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
+
+ float32x4x4_t tmp =
+ {
+ {
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0)
+ }
+ };
+
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 3);
+
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 3);
+
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[2], 0);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[2], 1);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[2], 2);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[2], 3);
+
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[3], 0);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[3], 1);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[3], 2);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[3], 3);
+
+ vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
}
void NEScaleKernel::scale_area(const Window &window)
@@ -349,8 +508,9 @@
in, out);
}
-void NEScaleKernel::run(const Window &window)
+void NEScaleKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
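Every lane set in the bilinear scale cases above comes from delta_bilinear_c1, which blends the four neighbours around a fractional source coordinate. A scalar sketch of that helper (illustrative only; it assumes stride is the row pitch in elements of T):

#include <cstddef>

template <typename T>
float delta_bilinear_sketch(const T *top_left, size_t stride, float dx, float dy)
{
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;

    // Four neighbours of the sub-pixel source position
    const float a00 = top_left[0];
    const float a01 = top_left[1];
    const float a10 = top_left[stride];
    const float a11 = top_left[stride + 1];

    // Bilinear weights sum to 1, so the result stays within the input range
    return a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
}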
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
index 183df1e..f23c31b 100644
--- a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
@@ -135,8 +135,9 @@
return BorderSize(1);
}
-void NEScharr3x3Kernel::run(const Window &window)
+void NEScharr3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
index ab08a1c..5a80630 100644
--- a/src/core/NEON/kernels/NESobel3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
@@ -88,8 +88,9 @@
INEKernel::configure(win);
}
-void NESobel3x3Kernel::run(const Window &window)
+void NESobel3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
index 488eee1..30e7817 100644
--- a/src/core/NEON/kernels/NESobel5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
@@ -90,8 +90,9 @@
INEKernel::configure(win);
}
-void NESobel5x5HorKernel::run(const Window &window)
+void NESobel5x5HorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -261,8 +262,9 @@
INEKernel::configure(win);
}
-void NESobel5x5VertKernel::run(const Window &window)
+void NESobel5x5VertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
index 9761942..4cc80f8 100644
--- a/src/core/NEON/kernels/NESobel7x7Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
@@ -193,8 +193,9 @@
INEKernel::configure(win);
}
-void NESobel7x7HorKernel::run(const Window &window)
+void NESobel7x7HorKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -351,8 +352,9 @@
INEKernel::configure(win);
}
-void NESobel7x7VertKernel::run(const Window &window)
+void NESobel7x7VertKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 942662e..648dac4 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -26,7 +26,6 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/NEON/NEMath.h"
@@ -43,6 +42,104 @@
namespace
{
+void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+ qint8x16_t vec_max = vdupq_n_s8(std::numeric_limits<qint8_t>::lowest());
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ const qint8x16_t current_value = vld1q_qs8(in_ptr);
+ vec_max = vmaxq_qs8(vec_max, current_value);
+ },
+ input);
+
+ qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max));
+ carry_max = vpmax_qs8(carry_max, carry_max);
+ carry_max = vpmax_qs8(carry_max, carry_max);
+ carry_max = vpmax_qs8(carry_max, carry_max);
+
+ *(reinterpret_cast<qint8_t *>(output.ptr())) = vget_lane_s8(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+void logits_1d_max_qs16(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+ qint16x8_t vec_max = vdupq_n_qs16(std::numeric_limits<qint16_t>::lowest());
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint16_t *>(input.ptr());
+ const qint16x8_t current_value = vld1q_qs16(in_ptr);
+ vec_max = vmaxq_qs16(vec_max, current_value);
+ },
+ input);
+
+ qint16x4_t carry_max = vpmax_qs16(vget_high_qs16(vec_max), vget_low_qs16(vec_max));
+ carry_max = vpmax_qs16(carry_max, carry_max);
+ carry_max = vpmax_qs16(carry_max, carry_max);
+
+ *(reinterpret_cast<qint16_t *>(output.ptr())) = vget_lane_s16(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void logits_1d_max_f16(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+ float16x8_t vec_max = vdupq_n_f16(std::numeric_limits<float16_t>::lowest());
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const float16x8_t current_value = vld1q_f16(in_ptr);
+ vec_max = vmaxq_f16(vec_max, current_value);
+ },
+ input);
+
+ float16x4_t carry_max = vpmax_f16(vget_high_f16(vec_max), vget_low_f16(vec_max));
+ carry_max = vpmax_f16(carry_max, carry_max);
+ carry_max = vpmax_f16(carry_max, carry_max);
+
+ *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
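Each new max kernel finishes with the same horizontal reduction: pairwise maxima are folded until a single lane holds the row maximum. The pattern, sketched here for plain uint8 lanes (the kernels above apply it to qs8, qs16 and f16 respectively):

#include <arm_neon.h>
#include <cstdint>

inline uint8_t horizontal_max_u8(uint8x16_t v)
{
    // 16 lanes -> 8 candidates
    uint8x8_t m = vpmax_u8(vget_high_u8(v), vget_low_u8(v));
    // Each pairwise fold halves the number of distinct candidates: 8 -> 4 -> 2 -> 1
    m = vpmax_u8(m, m);
    m = vpmax_u8(m, m);
    m = vpmax_u8(m, m);
    return vget_lane_u8(m, 0);
}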
void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window)
{
Window in_slice = window.first_slice_window_1D();
@@ -73,39 +170,6 @@
}
while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
}
-
-void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
-{
- Window in_slice = window.first_slice_window_1D();
-
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window max_slice = window_max.first_slice_window_1D();
-
- do
- {
- Iterator input(in, in_slice);
- Iterator output(out, max_slice);
-
- qint8x16_t vec_max = vdupq_n_s8(-1);
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
- const qint8x16_t current_value = vld1q_qs8(in_ptr);
- vec_max = vmaxq_qs8(vec_max, current_value);
- },
- input);
-
- qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max));
- carry_max = vpmax_qs8(carry_max, carry_max);
- carry_max = vpmax_qs8(carry_max, carry_max);
- carry_max = vpmax_qs8(carry_max, carry_max);
-
- *(reinterpret_cast<int8_t *>(output.ptr())) = vget_lane_s8(carry_max, 0);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
} // namespace
NELogits1DMaxKernel::NELogits1DMaxKernel()
@@ -120,30 +184,46 @@
void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Softmax across the x dimension
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.set(0, 1);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
const int input_width = input->info()->valid_region().shape.x();
- unsigned int num_elems_processed_per_iteration = 0;
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
switch(input->info()->data_type())
{
case DataType::QS8:
- _func = &logits_1d_max_qs8;
- num_elems_processed_per_iteration = 16;
+ _func = &logits_1d_max_qs8;
+ break;
+ case DataType::QS16:
+ _func = &logits_1d_max_qs16;
break;
case DataType::F32:
- num_elems_processed_per_iteration = 4;
- _func = &logits_1d_max_f32;
+ _func = &logits_1d_max_f32;
break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &logits_1d_max_f16;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
}
_input = input;
_output = output;
- _border_size = BorderSize(0, input_width % num_elems_processed_per_iteration, 0, 0);
+ _border_size = BorderSize(0, num_elems_processed_per_iteration - (input_width % num_elems_processed_per_iteration), 0, 0);
// Configure kernel window
constexpr unsigned int num_elems_written_per_row = 1;
@@ -159,8 +239,9 @@
INEKernel::configure(win);
}
-void NELogits1DMaxKernel::run(const Window &window)
+void NELogits1DMaxKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
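The configure change above replaces the per-type constants with 16 / element_size, i.e. the number of lanes a 128-bit NEON register holds, and pads the row on the right so the final vector load stays in bounds. A sketch of that arithmetic (illustrative helper, not library API):

#include <cstddef>

// element_size: 1 for QS8, 2 for QS16/F16, 4 for F32.
inline size_t right_border_for_row(size_t input_width, size_t element_size)
{
    const size_t elems_per_iteration = 16 / element_size;
    // Mirrors the expression used in configure(); note it yields a full extra
    // vector when input_width is already a multiple of the lane count.
    return elems_per_iteration - (input_width % elems_per_iteration);
}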
@@ -170,67 +251,6 @@
namespace
{
-void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
-{
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- Window max_slice = window_max.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
-
- constexpr int step = 4;
- const int long_steps = in->info()->valid_region().shape.x() / step;
- const int small_steps = in->info()->valid_region().shape.x() % step;
-
- do
- {
- Iterator input(in, in_slice);
- Iterator exp(out, in_slice);
- Iterator _max(max, max_slice);
- Iterator _sum(sum, max_slice);
-
- // Get pointers
- auto in_ptr = reinterpret_cast<const float *>(input.ptr());
- auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
-
- // Init sum to zero
- float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
-
- // Get max value
- const auto max_ptr = reinterpret_cast<const float *>(_max.ptr());
- const float32x4_t vec_max = vdupq_n_f32(*max_ptr);
-
- // Run neon loop
- for(int i = 0; i < long_steps; ++i)
- {
- float32x4_t vec_elements = vld1q_f32(in_ptr);
- vec_elements = vsubq_f32(vec_elements, vec_max);
- vec_elements = vexpq_f32(vec_elements);
-
- vst1q_f32(exp_ptr, vec_elements);
- vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
-
- in_ptr += step;
- exp_ptr += step;
- }
-
- // Reduce sum
- float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
- carry_addition = vpadd_f32(carry_addition, carry_addition);
- float sum = vget_lane_f32(carry_addition, 0);
-
- // Run remaining elements
- for(int i = 0; i < small_steps; ++i)
- {
- float element = std::exp(in_ptr[i] - *max_ptr);
- exp_ptr[i] = element;
- sum += element;
- }
-
- *(reinterpret_cast<float *>(_sum.ptr())) = sum;
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
{
Window window_max(window);
@@ -293,6 +313,190 @@
}
while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
}
+void logits_1d_shift_exp_sum_qs16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 4;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+ const int fixed_point_position = in->info()->fixed_point_position();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const qint16_t *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<qint16_t *>(exp.ptr());
+
+ // Init sum to zero
+ qint32x4_t vec_sum_value = vdupq_n_qs32(0);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const qint16_t *>(_max.ptr());
+ const qint16x4_t vec_max = vdup_n_qs16(*max_ptr);
+
+ // Run neon loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ qint16x4_t vec_elements = vld1_qs16(in_ptr);
+ vec_elements = vqsub_qs16(vec_elements, vec_max);
+ vec_elements = vqexp_qs16(vec_elements, fixed_point_position);
+
+ vst1_qs16(exp_ptr, vec_elements);
+ vec_sum_value = vqaddq_qs32(vec_sum_value, vmovl_s16(vec_elements));
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+ // Reduce sum
+ qint32x2_t carry_addition = vqadd_qs32(vget_high_s32(vec_sum_value), vget_low_s32(vec_sum_value));
+ qint32_t sum = vget_lane_s32(carry_addition, 0) + vget_lane_s32(carry_addition, 1);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ qint16_t element = sqexp_qs16(sqsub_qs16(in_ptr[i], *max_ptr), fixed_point_position);
+ exp_ptr[i] = element;
+ sum = sqadd_qs32(sum, element);
+ }
+
+ *(reinterpret_cast<qint16_t *>(_sum.ptr())) = sqmovn_qs32(sum);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void logits_1d_shift_exp_sum_f16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 8;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<float16_t *>(exp.ptr());
+
+ // Init sum to zero
+ float16x8_t vec_sum_value = vdupq_n_f16(0);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const float16_t *>(_max.ptr());
+ const float16x8_t vec_max = vdupq_n_f16(*max_ptr);
+
+ // Run neon loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ float16x8_t vec_elements = vld1q_f16(in_ptr);
+ vec_elements = vsubq_f16(vec_elements, vec_max);
+ vec_elements = vexpq_f16(vec_elements);
+
+ vst1q_f16(exp_ptr, vec_elements);
+ vec_sum_value = vaddq_f16(vec_sum_value, vec_elements);
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+ // Reduce sum
+ const float16x4_t sum_red = vadd_f16(vget_low_f16(vec_sum_value), vget_high_f16(vec_sum_value));
+ const float16x4_t carry_addition = vpadd_f16(sum_red, sum_red);
+ float16_t sum = vget_lane_f16(carry_addition, 0) + vget_lane_f16(carry_addition, 1);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ const float16_t element = std::exp(static_cast<float>(in_ptr[i] - *max_ptr));
+ exp_ptr[i] = element;
+ sum += element;
+ }
+ *(reinterpret_cast<float16_t *>(_sum.ptr())) = sum;
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 4;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
+
+ // Init sum to zero
+ float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const float *>(_max.ptr());
+ const float32x4_t vec_max = vdupq_n_f32(*max_ptr);
+
+ // Run neon loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ float32x4_t vec_elements = vld1q_f32(in_ptr);
+ vec_elements = vsubq_f32(vec_elements, vec_max);
+ vec_elements = vexpq_f32(vec_elements);
+
+ vst1q_f32(exp_ptr, vec_elements);
+ vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+
+ // Reduce sum
+ float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
+ carry_addition = vpadd_f32(carry_addition, carry_addition);
+ float sum = vget_lane_f32(carry_addition, 0);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ float element = std::exp(in_ptr[i] - *max_ptr);
+ exp_ptr[i] = element;
+ sum += element;
+ }
+
+ *(reinterpret_cast<float *>(_sum.ptr())) = sum;
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
} //namespace
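All of the shift-exp-sum variants implement the same row-wise step of softmax: subtract the row maximum before exponentiating (for numerical stability), store the exponentials and accumulate their sum. The scalar equivalent, sketched for float:

#include <cmath>
#include <cstddef>

inline float shift_exp_sum_row(const float *in, float *out, size_t width, float row_max)
{
    float sum = 0.0f;
    for(size_t x = 0; x < width; ++x)
    {
        const float e = std::exp(in[x] - row_max); // shift keeps exp() well conditioned
        out[x] = e;
        sum += e;
    }
    return sum; // consumed by the norm kernel below
}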
NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel()
@@ -302,11 +506,16 @@
void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, max, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
unsigned int num_elems_processed_per_iteration = input->info()->valid_region().shape.x();
@@ -316,11 +525,20 @@
case DataType::QS8:
_func = &logits_1d_shift_exp_sum_qs8;
break;
+ case DataType::QS16:
+ _func = &logits_1d_shift_exp_sum_qs16;
+ break;
case DataType::F32:
_func = &logits_1d_shift_exp_sum_f32;
break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &logits_1d_shift_exp_sum_f16;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
+ break;
}
_input = input;
@@ -343,8 +561,9 @@
INEKernel::configure(win);
}
-void NELogits1DShiftExpSumKernel::run(const Window &window)
+void NELogits1DShiftExpSumKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -354,36 +573,6 @@
namespace
{
-void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
-{
- Window window_sum(window);
- window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window sum_slice = window_sum.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
-
- do
- {
- Iterator input(in, in_slice);
- Iterator _sum(sum, sum_slice);
- Iterator output(out, in_slice);
-
- const float sum_value = *reinterpret_cast<const float *>(_sum.ptr());
- const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto out_ptr = reinterpret_cast<float *>(output.ptr());
-
- const float32x4_t vec_in = vld1q_f32(in_ptr);
- const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
-
- vst1q_f32(out_ptr, normalized_value);
- },
- input, output);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
{
Window window_sum(window);
@@ -416,6 +605,101 @@
}
while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
}
+void logits_1d_norm_qs16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ const int fixed_point_position = in->info()->fixed_point_position();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
+ const int16_t sum_value = *reinterpret_cast<const qint16_t *>(_sum.ptr());
+ const qint16x8_t vec_sum_inversed = vqrecipq_qs16(vdupq_n_qs16(sum_value), fixed_point_position);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint16_t *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<qint16_t *>(output.ptr());
+
+ const qint16x8_t vec_in = vld1q_qs16(in_ptr);
+ const qint16x8_t normalized_value = vqmulq_qs16(vec_in, vec_sum_inversed, fixed_point_position);
+
+ vst1q_qs16(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void logits_1d_norm_f16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
+ const float16_t sum_value = *reinterpret_cast<const float16_t *>(_sum.ptr());
+ const float16x8_t vec_sum_inversed = vdupq_n_f16(1.0f / sum_value);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ const float16x8_t vec_in = vld1q_f16(in_ptr);
+ const float16x8_t normalized_value = vmulq_f16(vec_in, vec_sum_inversed);
+
+ vst1q_f16(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
+ const float sum_value = *reinterpret_cast<const float *>(_sum.ptr());
+ const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(output.ptr());
+
+ const float32x4_t vec_in = vld1q_f32(in_ptr);
+ const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
+
+ vst1q_f32(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
} // namespace
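The norm kernels complete softmax by scaling every stored exponential with the reciprocal of the row sum, so softmax(x) = exp(x - max) / sum(exp(x - max)). A scalar sketch of the float path:

#include <cstddef>

inline void norm_row(const float *exp_in, float *out, size_t width, float sum)
{
    const float inv_sum = 1.0f / sum; // invert once, multiply per element
    for(size_t x = 0; x < width; ++x)
    {
        out[x] = exp_in[x] * inv_sum;
    }
}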
NELogits1DNormKernel::NELogits1DNormKernel()
@@ -425,9 +709,14 @@
void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
_input = input;
@@ -435,20 +724,27 @@
_output = output;
// Configure kernel window
- unsigned int num_elems_processed_per_iteration = 0;
+ unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
switch(input->info()->data_type())
{
case DataType::QS8:
- _func = &logits_1d_norm_qs8;
- num_elems_processed_per_iteration = 16;
+ _func = &logits_1d_norm_qs8;
+ break;
+ case DataType::QS16:
+ _func = &logits_1d_norm_qs16;
break;
case DataType::F32:
- num_elems_processed_per_iteration = 4;
- _func = &logits_1d_norm_f32;
+ _func = &logits_1d_norm_f32;
break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &logits_1d_norm_f16;
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
+ break;
}
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -464,8 +760,9 @@
INEKernel::configure(win);
}
-void NELogits1DNormKernel::run(const Window &window)
+void NELogits1DNormKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp
index f0b58d8..958f4a9 100644
--- a/src/core/NEON/kernels/NETableLookupKernel.cpp
+++ b/src/core/NEON/kernels/NETableLookupKernel.cpp
@@ -133,8 +133,9 @@
INESimpleKernel::configure(input, output, num_num_elems_processed_per_iteration);
}
-void NETableLookupKernel::run(const Window &window)
+void NETableLookupKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
index 7203119..5ef0693 100644
--- a/src/core/NEON/kernels/NEThresholdKernel.cpp
+++ b/src/core/NEON/kernels/NEThresholdKernel.cpp
@@ -119,8 +119,9 @@
input, output);
}
-void NEThresholdKernel::run(const Window &window)
+void NEThresholdKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 492de8a..1cfaafe 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -179,8 +179,9 @@
void NETransposeKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape{ input->info()->tensor_shape() };
const size_t w_out = input->info()->dimension(1);
@@ -191,8 +192,9 @@
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
@@ -231,8 +233,9 @@
INEKernel::configure(win);
}
-void NETransposeKernel::run(const Window &window)
+void NETransposeKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index 6c90a33..62f4e5d 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -49,8 +49,14 @@
{
}
-void INEWarpKernel::run(const Window &window)
+BorderSize INEWarpKernel::border_size() const
{
+ return BorderSize(1);
+}
+
+void INEWarpKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
@@ -93,9 +99,9 @@
// Reads can occur within the valid region of the input
AccessWindowStatic input_access(input->info(),
- input_valid_region.anchor[0], input_valid_region.anchor[1],
- input_valid_region.anchor[0] + input_valid_region.shape[0],
- input_valid_region.anchor[1] + input_valid_region.shape[1]);
+ input_valid_region.anchor[0] - border_size().left, input_valid_region.anchor[1] - border_size().top,
+ input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
+ input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
AccessWindowHorizontal output_access(output->info(), 0, 1);
update_window_and_padding(win, input_access, output_access);
@@ -171,7 +177,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -250,7 +256,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -330,7 +336,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -390,7 +396,11 @@
const float start_z0 = M20 * window.x().start();
// Current row
- int y_cur = window.y().start();
+ int y_cur = window.y().start();
+ int z_cur = window.z().start();
+ int d3_cur = window[3].start();
+ int d4_cur = window[4].start();
+ int d5_cur = window[5].start();
// const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
float const_x0 = M01 * y_cur + M02;
@@ -405,9 +415,13 @@
execute_window_loop(window, [&](const Coordinates & id)
{
// Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
- if(y_cur != id.y())
+ if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
{
- y_cur = id.y();
+ y_cur = id.y();
+ z_cur = id.z();
+ d3_cur = id[3];
+ d4_cur = id[4];
+ d5_cur = id[5];
const_x0 = M01 * y_cur + M02;
const_y0 = M11 * y_cur + M12;
@@ -431,7 +445,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -484,7 +498,11 @@
const float start_z0 = M20 * window.x().start();
// Current row
- int y_cur = window.y().start();
+ int y_cur = window.y().start();
+ int z_cur = window.z().start();
+ int d3_cur = window[3].start();
+ int d4_cur = window[4].start();
+ int d5_cur = window[5].start();
// const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
float const_x0 = M01 * y_cur + M02;
@@ -498,10 +516,14 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- // Check if we are processing a new row. If so, update the current row (y_cur), x0, y0 and z0
- if(y_cur != id.y())
+ // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
+ if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
{
- y_cur = id.y();
+ y_cur = id.y();
+ z_cur = id.z();
+ d3_cur = id[3];
+ d4_cur = id[4];
+ d5_cur = id[5];
const_x0 = M01 * y_cur + M02;
const_y0 = M11 * y_cur + M12;
@@ -516,7 +538,6 @@
const float yn = y0 / z0;
// Only use input values if xn and yn are within the valid region.
- // Otherwise write the constant border value.
if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
{
switch(interpolation)
@@ -525,7 +546,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -533,7 +554,34 @@
}
else
{
- *out.ptr() = _constant_border_value;
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = _constant_border_value;
+ break;
+ case InterpolationPolicy::BILINEAR:
+ {
+ const auto xi = clamp<int>(std::floor(xn), min_x - 1, max_x);
+ const auto yi = clamp<int>(std::floor(yn), min_y - 1, max_y);
+ const auto xi_1 = clamp<int>(std::floor(xn + 1), min_x - 1, max_x);
+ const auto yi_1 = clamp<int>(std::floor(yn + 1), min_y - 1, max_y);
+
+ const float dx = xn - std::floor(xn);
+ const float dy = yn - std::floor(yn);
+ const float dx1 = 1.0f - dx;
+ const float dy1 = 1.0f - dy;
+
+ const float a00 = *(in.ptr() + xi + yi * stride);
+ const float a01 = *(in.ptr() + xi_1 + yi * stride);
+ const float a10 = *(in.ptr() + xi + yi_1 * stride);
+ const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
+
+ *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
}
x0 += M00;
@@ -562,7 +610,11 @@
const size_t stride = _input->info()->strides_in_bytes()[1];
// Current row
- int y_cur = window.y().start();
+ int y_cur = window.y().start();
+ int z_cur = window.z().start();
+ int d3_cur = window[3].start();
+ int d4_cur = window[4].start();
+ int d5_cur = window[5].start();
// x0 = M00 * x + M01 * y + M02
// y0 = M10 * x + M11 * y + M12
@@ -596,10 +648,14 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- // Check if we are processing a new row. If so, update the current row (y_cur), x0, y0 and z0
- if(y_cur != id.y())
+ // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
+ if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
{
- y_cur = id.y();
+ y_cur = id.y();
+ z_cur = id.z();
+ d3_cur = id[3];
+ d4_cur = id[4];
+ d5_cur = id[5];
const_x0 = M01 * y_cur + M02;
const_y0 = M11 * y_cur + M12;
@@ -614,7 +670,6 @@
const float yn = y0 / z0;
// Only load from (x0, y0) if the point is within the valid region.
- // Otherwise load from the edge of the valid region.
if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
{
switch(interpolation)
@@ -623,7 +678,7 @@
*out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -632,10 +687,34 @@
else
{
// Clamp coordinates
- const auto xi = clamp<int>(x0, min_x, max_x - 1);
- const auto yi = clamp<int>(y0, min_y, max_y - 1);
+ const auto xi = clamp<int>(std::floor(xn), min_x, max_x - 1);
+ const auto yi = clamp<int>(std::floor(yn), min_y, max_y - 1);
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = *(in.ptr() + xi + yi * stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ {
+ const auto xi_1 = clamp<int>(std::floor(xn + 1), min_x, max_x - 1);
+ const auto yi_1 = clamp<int>(std::floor(yn + 1), min_y, max_y - 1);
- *out.ptr() = *(in.ptr() + xi + yi * stride);
+ const float dx = xn - std::floor(xn);
+ const float dy = yn - std::floor(yn);
+ const float dx1 = 1.0f - dx;
+ const float dy1 = 1.0f - dy;
+
+ const float a00 = *(in.ptr() + xi + yi * stride);
+ const float a01 = *(in.ptr() + xi_1 + yi * stride);
+ const float a10 = *(in.ptr() + xi + yi_1 * stride);
+ const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
+
+ *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
}
x0 += M00;
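The new out-of-bounds branches in NEWarpKernel no longer write the constant border value (or the nearest edge pixel) unconditionally; for BILINEAR they clamp the four neighbour coordinates into the valid region and interpolate from there. A standalone sketch of that sampling, assuming a single-channel uint8 image with stride bytes per row:

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t clamped_bilinear(const uint8_t *src, int stride, float xn, float yn,
                                int min_x, int max_x, int min_y, int max_y)
{
    const int xi   = std::clamp(static_cast<int>(std::floor(xn)), min_x, max_x - 1);
    const int yi   = std::clamp(static_cast<int>(std::floor(yn)), min_y, max_y - 1);
    const int xi_1 = std::clamp(static_cast<int>(std::floor(xn + 1.0f)), min_x, max_x - 1);
    const int yi_1 = std::clamp(static_cast<int>(std::floor(yn + 1.0f)), min_y, max_y - 1);

    const float dx  = xn - std::floor(xn);
    const float dy  = yn - std::floor(yn);
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;

    const float a00 = src[xi + yi * stride];
    const float a01 = src[xi_1 + yi * stride];
    const float a10 = src[xi + yi_1 * stride];
    const float a11 = src[xi_1 + yi_1 * stride];

    return static_cast<uint8_t>(a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy));
}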
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index aa6be44..d52e88c 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -37,7 +37,8 @@
template <typename T>
void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
{
- const unsigned int kernel_size = input->info()->dimension(0);
+ const unsigned int kernel_size_x = input->info()->dimension(0);
+ const unsigned int kernel_size_y = input->info()->dimension(1);
const unsigned int kernel_depth = input->info()->dimension(2);
const unsigned int input_stride_x = input->info()->strides_in_bytes().x();
const unsigned int input_stride_y = input->info()->strides_in_bytes().y();
@@ -61,9 +62,9 @@
// Linearize volume
for(unsigned int d = 0; d < kernel_depth; ++d)
{
- for(unsigned int j = 0; j < kernel_size; ++j)
+ for(unsigned int j = 0; j < kernel_size_y; ++j)
{
- for(unsigned int i = 0; i < kernel_size; ++i)
+ for(unsigned int i = 0; i < kernel_size_x; ++i)
{
*(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
tmp_input_ptr += input_stride_x;
@@ -94,62 +95,60 @@
void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- TensorShape output_shape{ input->info()->tensor_shape() };
+ const int fixed_point_position = input->info()->fixed_point_position();
+ const DataType dt = input->info()->data_type();
+ const TensorShape &input_shape = input->info()->tensor_shape();
+ TensorShape output_shape{ input_shape };
output_shape.collapse(3);
+
const size_t tmp_dim = output_shape[0];
output_shape.set(0, output_shape[1]);
output_shape.set(1, tmp_dim + (bias != nullptr ? 1 : 0));
- // Set data type and shape for output tensor if not yet configured
- set_data_type_if_unknown(*output->info(), dt);
- set_fixed_point_position_if_zero(*output->info(), fixed_point_position);
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, dt, fixed_point_position);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
if(bias != nullptr)
{
- TensorShape bias_shape{ input->info()->tensor_shape()[3] };
-
- // Set data type and shape for bias tensor if not yet configured
- set_data_type_if_unknown(*bias->info(), dt);
- set_fixed_point_position_if_zero(*bias->info(), fixed_point_position);
- set_shape_if_empty(*bias->info(), bias_shape);
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(bias->info()->tensor_shape(), bias_shape);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F32, DataType::QS8);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (bias->info()->num_dimensions() != 1));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (bias->info()->num_dimensions() != 2));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (bias->info()->dimension(0) != input->info()->tensor_shape()[3]));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (bias->info()->dimension(0) != input->info()->tensor_shape()[3] || bias->info()->dimension(1) != input->info()->tensor_shape()[4]));
}
_input = input;
_bias = bias;
_output = output;
- switch(_input->info()->data_type())
+ switch(_input->info()->element_size())
{
- case DataType::F32:
+ case 4:
{
_func = &weights_reshape<uint32_t>;
break;
}
- case DataType::QS8:
+ case 2:
+ {
+ _func = &weights_reshape<uint16_t>;
+ break;
+ }
+ case 1:
{
_func = &weights_reshape<uint8_t>;
break;
}
default:
{
- ARM_COMPUTE_ERROR_ON("Data type not supported");
+ ARM_COMPUTE_ERROR("Element size not supported");
break;
}
}
@@ -166,8 +165,9 @@
INEKernel::configure(window);
}
-void NEWeightsReshapeKernel::run(const Window &window)
+void NEWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info)
{
+ ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
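For each output feature map, the reshape above walks kernel_x * kernel_y * depth input elements and writes them out as one contiguous row, appending the bias value when one is supplied. A plain-array sketch of that linearisation (the ITensor iterators and byte strides are replaced with simple indices here):

#include <cstddef>

inline void reshape_one_ofm(const float *weights, const float *bias, float *row,
                            size_t kx, size_t ky, size_t depth)
{
    size_t n = 0;
    for(size_t d = 0; d < depth; ++d)
    {
        for(size_t j = 0; j < ky; ++j)
        {
            for(size_t i = 0; i < kx; ++i)
            {
                row[n++] = weights[(d * ky + j) * kx + i]; // weights laid out as [depth][ky][kx]
            }
        }
    }
    if(bias != nullptr)
    {
        row[n] = *bias; // the extra element added to the output shape when bias is present
    }
}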
diff --git a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
new file mode 100644
index 0000000..ad0743b
--- /dev/null
+++ b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace arm_compute
+{
+void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _transform_0 = transform_0;
+ _transform_1 = transform_1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info());
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, 8, 6);
+
+ const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 6);
+ const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 8);
+
+ update_window_and_padding(win,
+ AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+ AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+ output_access);
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMAArch32Kernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
+
+ const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
+
+ const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+ const int N = _output->info()->tensor_shape().x();
+ const int K = _input0->info()->tensor_shape().x();
+
+ // Only iterate over batches
+ Window win(window);
+ win.set(0, Window::Dimension(0, 1, 1));
+ win.set(1, Window::Dimension(0, 1, 1));
+
+ Iterator in0(_input0, window);
+ Iterator out(_output, window);
+
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ constexpr size_t alignment = 4096;
+ const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+ void *workspace = _workspace->buffer() + offset;
+ size_t workspace_size = _workspace->info()->total_size();
+
+ if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+ {
+ ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
+ reinterpret_cast<const float *>(in1_ptr), ldb,
+ reinterpret_cast<float *>(out.ptr()), ldc,
+ _alpha, _beta, workspace);
+ },
+ in0, out);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
new file mode 100644
index 0000000..d70524b
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace arm_compute
+{
+void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _transform_0 = transform_0;
+ _transform_1 = transform_1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info());
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
+
+ const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
+ const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
+
+ update_window_and_padding(win,
+ AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+ AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+ output_access);
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMAArch64Kernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
+ const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
+
+ const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
+
+ const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+ const int N = _output->info()->tensor_shape().x();
+ const int K = _input0->info()->tensor_shape().x();
+
+ // Only iterate over batches
+ Window win(window);
+ win.set(0, Window::Dimension(0, 1, 1));
+ win.set(1, Window::Dimension(0, 1, 1));
+
+ Iterator in0(_input0, window);
+ Iterator out(_output, window);
+
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ constexpr size_t alignment = 4096;
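+ // Per-thread workspace slice with (alignment - 1) bytes of slack for the std::align call below, mirroring the AArch32 kernel.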
+ const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+ void *workspace = _workspace->buffer() + offset;
+ size_t workspace_size = _workspace->info()->total_size();
+
+ if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+ {
+ ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
+ reinterpret_cast<const float *>(in1_ptr), ldb,
+ reinterpret_cast<float *>(out.ptr()), ldc,
+ _alpha, _beta, workspace);
+ },
+ in0, out);
+}
+} // namespace arm_compute
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 3d07ccb..91a3531 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -244,13 +244,13 @@
{
if(_tensor_shape.total_size() > 0)
{
- required_strides = Strides(stride_x);
+ required_strides = Strides(stride_x, stride_x);
required_total_size = stride_z;
}
break;
}
case 1:
- required_strides = compute_strides(*this, stride_x);
+ required_strides = compute_strides(*this, stride_x, stride_y);
required_total_size = stride_z;
break;
case 2:
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index bf005c1..99d3956 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -156,6 +156,8 @@
{ ActivationLayerInfo::ActivationFunction::LOGISTIC, "LOGISTIC" },
{ ActivationLayerInfo::ActivationFunction::RELU, "RELU" },
{ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, "BRELU" },
+ { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU" },
+ { ActivationLayerInfo::ActivationFunction::LEAKY_RELU, "LRELU" },
{ ActivationLayerInfo::ActivationFunction::SOFT_RELU, "SRELU" },
{ ActivationLayerInfo::ActivationFunction::SQRT, "SQRT" },
{ ActivationLayerInfo::ActivationFunction::SQUARE, "SQUARE" },
@@ -226,6 +228,18 @@
return norm_type_map[type];
}
+const std::string &arm_compute::string_from_pooling_type(PoolingType type)
+{
+ static std::map<PoolingType, const std::string> pool_type_map =
+ {
+ { PoolingType::MAX, "MAX" },
+ { PoolingType::AVG, "AVG" },
+ { PoolingType::L2, "L2" },
+ };
+
+ return pool_type_map[type];
+}
+
std::string arm_compute::lower_string(const std::string &val)
{
std::string res = val;
@@ -233,22 +247,25 @@
return res;
}
-const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height, unsigned int kernel_size,
- unsigned int stride_x, unsigned int stride_y,
- unsigned int pad_x, unsigned int pad_y,
- DimensionRoundingType round_type)
+const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height,
+ unsigned int kernel_width, unsigned int kernel_height,
+ const PadStrideInfo &pad_stride_info)
{
- unsigned int w = 0;
- unsigned int h = 0;
- switch(round_type)
+ const unsigned int pad_x = pad_stride_info.pad().first;
+ const unsigned int pad_y = pad_stride_info.pad().second;
+ const unsigned int stride_x = pad_stride_info.stride().first;
+ const unsigned int stride_y = pad_stride_info.stride().second;
+ unsigned int w = 0;
+ unsigned int h = 0;
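+ // out = (in + 2 * pad - kernel) / stride + 1 per spatial dimension, with the
+ // division rounded down for FLOOR and up for CEIL.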
+ switch(pad_stride_info.round())
{
case DimensionRoundingType::FLOOR:
- w = static_cast<unsigned int>(std::floor((static_cast<float>(width + 2 * pad_x - kernel_size) / stride_x) + 1));
- h = static_cast<unsigned int>(std::floor((static_cast<float>(height + 2 * pad_y - kernel_size) / stride_y) + 1));
+ w = static_cast<unsigned int>(std::floor((static_cast<float>(width + 2 * pad_x - kernel_width) / stride_x) + 1));
+ h = static_cast<unsigned int>(std::floor((static_cast<float>(height + 2 * pad_y - kernel_height) / stride_y) + 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + 2 * pad_x - kernel_size) / stride_x) + 1));
- h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + 2 * pad_y - kernel_size) / stride_y) + 1));
+ w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + 2 * pad_x - kernel_width) / stride_x) + 1));
+ h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + 2 * pad_y - kernel_height) / stride_y) + 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -283,6 +300,7 @@
case DataType::U16:
print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
break;
+ case DataType::QS16:
case DataType::S16:
print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, element_delim);
break;
@@ -313,6 +331,7 @@
return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
case DataType::U16:
return max_consecutive_elements_display_width_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n);
+ case DataType::QS16:
case DataType::S16:
return max_consecutive_elements_display_width_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n);
case DataType::U32:
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index ae2841d..084a325 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -60,6 +60,22 @@
}
}
+void arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
+ const arm_compute::Window &full, const arm_compute::Window &window, const int dim)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(dim);
+
+ full.validate();
+ window.validate();
+
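+ // A dimension can only be collapsed if the sub-window spans the full window along it, starting at 0.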
+ ARM_COMPUTE_ERROR_ON_LOC(window[dim].start() != 0, function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(window[dim].start() != full[dim].start(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(full[dim].end() != window[dim].end(), function, file, line);
+}
+
void arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
const arm_compute::Coordinates &pos, unsigned int max_dim)
{
@@ -168,7 +184,7 @@
ARM_COMPUTE_UNUSED(kernel);
ARM_COMPUTE_ERROR_ON_LOC(kernel == nullptr, function, file, line);
- ARM_COMPUTE_ERROR_ON_LOC_MSG((kernel->window().x().start() == kernel->window().x().end()) && (kernel->window().x().end() == 0),
+ ARM_COMPUTE_ERROR_ON_LOC_MSG((kernel->window().x().start() == kernel->window().x().end()) && (kernel->window().x().end() == 0) && (kernel->window().x().step() == 0),
function, file, line,
"This kernel hasn't been configured.");
}
diff --git a/src/graph/CL/CLMap.cpp b/src/graph/CL/CLMap.cpp
new file mode 100644
index 0000000..4892b96
--- /dev/null
+++ b/src/graph/CL/CLMap.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/CL/CLMap.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+using namespace arm_compute::graph;
+
+CLMap::CLMap(Tensor *tensor, bool blocking)
+ : _tensor(dynamic_cast<arm_compute::CLTensor *>(tensor->tensor())), _blocking(blocking)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+}
+
+void CLMap::run()
+{
+ _tensor->map(_blocking);
+}
diff --git a/src/graph/CL/CLUnmap.cpp b/src/graph/CL/CLUnmap.cpp
new file mode 100644
index 0000000..ec7d865
--- /dev/null
+++ b/src/graph/CL/CLUnmap.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/CL/CLUnmap.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+using namespace arm_compute::graph;
+
+CLUnmap::CLUnmap(Tensor *tensor)
+ : _tensor(dynamic_cast<arm_compute::CLTensor *>(tensor->tensor()))
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+}
+
+void CLUnmap::run()
+{
+ _tensor->unmap();
+}
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
new file mode 100644
index 0000000..525506f
--- /dev/null
+++ b/src/graph/Graph.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/Graph.h"
+
+#include "arm_compute/graph/CL/CLMap.h"
+#include "arm_compute/graph/CL/CLUnmap.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+
+using namespace arm_compute::graph;
+
+struct Stage
+{
+ Tensor *_input;
+ Tensor *_output;
+ std::unique_ptr<arm_compute::IFunction> _function;
+};
+
+struct Graph::Private
+{
+public:
+ /** Finalizes the current node's configuration
+ *
+ * @param _next_hint Device execution hint
+ */
+ void configure(Hint _next_hint);
+
+ /** Sets whether to enable information print out
+ *
+ * @param[in] is_enabled Set to true to print information about each node as it is configured
+ */
+ void set_info_enablement(bool is_enabled);
+
+ std::vector<Stage> _pipeline{};
+ std::vector<std::unique_ptr<Tensor>> _tensors{};
+ std::vector<std::unique_ptr<INode>> _nodes{};
+ Hint _current_hint{ Hint::DONT_CARE };
+ Hint _next_hint{ Hint::DONT_CARE };
+ std::unique_ptr<Tensor> _graph_input{ nullptr };
+ std::unique_ptr<Tensor> _graph_output{ nullptr };
+ std::unique_ptr<INode> _current_node{ nullptr };
+ Tensor *_current_output{ nullptr };
+ bool _info_enabled{ false };
+
+private:
+ Tensor *_current_input{ nullptr };
+ Hint _previous_hint{ Hint::DONT_CARE };
+};
+
+Graph::~Graph() //NOLINT
+{
+ //Can't use =default because the destructor must be defined after Graph::Private's definition
+}
+
+Graph::Graph()
+ : _pimpl{ new Private() }
+{
+}
+
+void Graph::run()
+{
+ while(true)
+ {
+ if(!_pimpl->_graph_input->call_accessor())
+ {
+ return;
+ }
+
+ for(auto &stage : _pimpl->_pipeline)
+ {
+ stage._function->run();
+ }
+
+ if(!_pimpl->_graph_output->call_accessor())
+ {
+ return;
+ }
+ }
+}
+
+//Finalize current node's configuration
+void Graph::Private::configure(Hint _next_hint)
+{
+ ARM_COMPUTE_ERROR_ON(_current_node == nullptr);
+ ARM_COMPUTE_ERROR_ON(_graph_input == nullptr);
+
+ // Is it the first node of the graph?
+ if(_current_input == nullptr)
+ {
+ _graph_input->set_target(_current_hint);
+ _current_input = _graph_input.get();
+ _previous_hint = _current_hint; // For the first node just assume the previous node was of the same type as this one
+ }
+
+ // Automatic output configuration?
+ if(_current_output == nullptr)
+ {
+ _tensors.push_back(arm_compute::support::cpp14::make_unique<Tensor>(TensorInfo()));
+ _current_output = _tensors.back().get();
+ }
+
+ // If either the writer or reader node needs OpenCL then use OpenCL memory:
+ if((_next_hint == Hint::OPENCL || _current_hint == Hint::OPENCL))
+ {
+ _current_output->set_target(Hint::OPENCL);
+ }
+ else
+ {
+ _current_output->set_target(Hint::NEON);
+ }
+
+ // Map input if needed
+ std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_current_hint, _current_input->tensor(), _current_output->tensor());
+ _current_input->allocate();
+
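+ // The input lives in OpenCL memory: if the previous node ran on NEON it wrote
+ // through a mapped pointer, so unmap before handing the buffer to OpenCL; if
+ // the current node runs on NEON, map the buffer (blocking) so it can read it.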
+ if(_current_input->target() == Hint::OPENCL)
+ {
+ if(_previous_hint == Hint::NEON)
+ {
+ ARM_COMPUTE_ERROR_ON(_current_hint == Hint::NEON);
+ _pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLUnmap>(_current_input) });
+ }
+ if(_current_hint == Hint::NEON)
+ {
+ ARM_COMPUTE_ERROR_ON(_previous_hint == Hint::NEON);
+ _pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLMap>(_current_input, true) });
+ }
+ }
+
+ _pipeline.push_back({ _current_input, _current_output, std::move(func) });
+
+ _current_input = _current_output;
+ _current_output = nullptr;
+ _previous_hint = _current_hint;
+ _current_hint = _next_hint;
+}
+
+void Graph::Private::set_info_enablement(bool is_enabled)
+{
+ _info_enabled = is_enabled;
+}
+
+void Graph::add_node(std::unique_ptr<INode> node)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_graph_input == nullptr, "The graph's input must be set before the first node is added");
+ ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_graph_output != nullptr, "Nothing can be added after the output tensor");
+ //Trigger the creation of the current Node:
+
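+ // Nodes are configured lazily: the previously added node is finalized here,
+ // now that the hint of the node consuming its output is known.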
+ Hint _next_hint = node->override_hint(_pimpl->_next_hint);
+ ARM_COMPUTE_ERROR_ON(_next_hint == Hint::DONT_CARE);
+ if(_pimpl->_current_node)
+ {
+ //Finalize the previous Node:
+ _pimpl->configure(_pimpl->_next_hint);
+
+ if(_pimpl->_info_enabled)
+ {
+ _pimpl->_current_node->print_info();
+ }
+ }
+ else
+ {
+ // If that's the first node then use the same Hint before and after the node.
+ _pimpl->_current_hint = _next_hint;
+ }
+ if(_pimpl->_current_node)
+ {
+ _pimpl->_nodes.push_back(std::move(_pimpl->_current_node));
+ }
+ _pimpl->_current_node = std::move(node);
+}
+
+void Graph::set_hint(Hint hint)
+{
+ _pimpl->_next_hint = hint;
+}
+
+void Graph::set_info_enablement(bool is_enabled)
+{
+ _pimpl->set_info_enablement(is_enabled);
+}
+
+// Add a tensor with an Accessor (i.e. either the input or output of the graph)
+void Graph::add_tensor(std::unique_ptr<Tensor> tensor)
+{
+ // If it's the first Tensor added then it will be the input of the Graph.
+ if(_pimpl->_graph_input == nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
+ ARM_COMPUTE_ERROR_ON(_pimpl->_current_node != nullptr);
+ _pimpl->_graph_input = std::move(tensor);
+ }
+ else
+ {
+ // Else it will be the output of the Graph
+ ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
+ ARM_COMPUTE_ERROR_ON(_pimpl->_current_node == nullptr);
+ _pimpl->_graph_output = std::move(tensor);
+ _pimpl->_current_output = _pimpl->_graph_output.get();
+
+ // Finalize the graph by configuring the last Node of the graph:
+ _pimpl->configure(_pimpl->_current_hint); // Ignore _next_hint as this is the last node, and just use the same hint as before this node.
+ _pimpl->_graph_output->allocate();
+ }
+}
+
+void Graph::set_temp(TensorInfo &&tmp)
+{
+ ARM_COMPUTE_ERROR_ON(_pimpl->_graph_input == nullptr);
+ ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_current_output != nullptr, "TensorInfo for temporary tensor already set");
+
+ _pimpl->_tensors.push_back(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tmp)));
+ _pimpl->_current_output = _pimpl->_tensors.back().get();
+}
+
+Graph &arm_compute::graph::operator<<(Graph &graph, TensorInfo &&info)
+{
+ graph.set_temp(std::move(info));
+ return graph;
+}
+
+Graph &arm_compute::graph::operator<<(Graph &graph, Tensor &&tensor)
+{
+ graph.add_tensor(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tensor)));
+ return graph;
+}
+
+Graph &arm_compute::graph::operator<<(Graph &graph, Hint hint)
+{
+ graph.set_hint(hint);
+ return graph;
+}
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
new file mode 100644
index 0000000..6b25022
--- /dev/null
+++ b/src/graph/INode.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/graph/INode.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Validate.h"
+
+#include <ostream>
+
+using namespace arm_compute::graph;
+
+Hint INode::override_hint(Hint hint) const
+{
+ if(hint == Hint::OPENCL && !opencl_is_available())
+ {
+ hint = Hint::DONT_CARE;
+ }
+ hint = node_override_hint(hint);
+ ARM_COMPUTE_ERROR_ON(hint == Hint::OPENCL && !opencl_is_available());
+ return hint;
+}
+
+Hint INode::node_override_hint(Hint hint) const
+{
+ return hint == Hint::DONT_CARE ? Hint::NEON : hint;
+}
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
new file mode 100644
index 0000000..c534ae0
--- /dev/null
+++ b/src/graph/Tensor.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/graph/Tensor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename TensorType>
+std::unique_ptr<ITensor> initialise_tensor(TensorInfo &info)
+{
+ auto tensor = arm_compute::support::cpp14::make_unique<TensorType>();
+ tensor->allocator()->init(info);
+ return std::move(tensor);
+}
+
+template <typename TensorType>
+void tensor_allocate(ITensor &tensor)
+{
+ auto itensor = dynamic_cast<TensorType *>(&tensor);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(itensor);
+ itensor->allocator()->allocate();
+}
+} // namespace
+
+Tensor::Tensor(TensorInfo &&info)
+ : _target(Hint::DONT_CARE), _info(info), _accessor(nullptr), _tensor(nullptr)
+{
+}
+
+Tensor::Tensor(Tensor &&src) noexcept
+ : _target(src._target),
+ _info(std::move(src._info)),
+ _accessor(std::move(src._accessor)),
+ _tensor(std::move(src._tensor))
+{
+}
+
+void Tensor::set_info(TensorInfo &&info)
+{
+ _info = info;
+}
+
+bool Tensor::call_accessor()
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_accessor.get());
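+ // OpenCL tensors have to be mapped to host memory before the accessor can
+ // touch their buffer; map only if no mapping exists yet, and the tensor is unmapped again afterwards.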
+ auto cl_tensor = dynamic_cast<arm_compute::CLTensor *>(_tensor.get());
+ if(cl_tensor != nullptr && cl_tensor->buffer() == nullptr)
+ {
+ cl_tensor->map();
+ }
+ bool retval = _accessor->access_tensor(*_tensor);
+ if(cl_tensor != nullptr)
+ {
+ cl_tensor->unmap();
+ }
+ return retval;
+}
+
+ITensor *Tensor::tensor()
+{
+ return _tensor.get();
+}
+
+const TensorInfo &Tensor::info() const
+{
+ return _info;
+}
+
+ITensor *Tensor::set_target(Hint target)
+{
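+ // The backing ITensor (CLTensor or Tensor) is created lazily on the first
+ // call; once created, the target can no longer change.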
+ if(_tensor != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON(target != _target);
+ }
+ else
+ {
+ switch(target)
+ {
+ case Hint::OPENCL:
+ _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
+ break;
+ case Hint::NEON:
+ _tensor = initialise_tensor<arm_compute::Tensor>(_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid Hint");
+ }
+ _target = target;
+ }
+ return _tensor.get();
+}
+
+void Tensor::allocate()
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor.get());
+ switch(_target)
+ {
+ case Hint::OPENCL:
+ tensor_allocate<arm_compute::CLTensor>(*_tensor);
+ break;
+ case Hint::NEON:
+ tensor_allocate<arm_compute::Tensor>(*_tensor);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid Hint");
+ }
+}
+
+void Tensor::allocate_and_fill_if_needed()
+{
+ allocate();
+ if(_accessor != nullptr)
+ {
+ call_accessor();
+ }
+}
+
+Hint Tensor::target() const
+{
+ return _target;
+}
diff --git a/src/graph/nodes/ActivationLayer.cpp b/src/graph/nodes/ActivationLayer.cpp
new file mode 100644
index 0000000..b71e22c
--- /dev/null
+++ b/src/graph/nodes/ActivationLayer.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ActivationLayer.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
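+// Backend dispatch: the Hint template parameter picks the matching function and
+// tensor types (CLActivationLayer with CLTensor for OPENCL, NEActivationLayer
+// with Tensor for NEON) through the explicit specialisations below.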
+template <typename ActivationType, typename TensorType, Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
+{
+ auto activation = arm_compute::support::cpp14::make_unique<ActivationType>();
+ activation->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ activation_info);
+
+ return std::move(activation);
+}
+
+template <Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
+{
+ return instantiate_function<arm_compute::CLActivationLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output, activation_info);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
+{
+ return instantiate_function<arm_compute::NEActivationLayer, arm_compute::Tensor, Hint::NEON>(input, output, activation_info);
+}
+} // namespace
+
+ActivationLayer::ActivationLayer(const ActivationLayerInfo activation_info)
+ : _activation_info(activation_info)
+{
+}
+
+std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+{
+ std::unique_ptr<arm_compute::IFunction> func;
+ _hint = hint;
+ _input = input;
+ _output = output;
+
+ if(_hint == Hint::OPENCL)
+ {
+ func = instantiate<Hint::OPENCL>(input, output, _activation_info);
+ }
+ else
+ {
+ func = instantiate<Hint::NEON>(input, output, _activation_info);
+ }
+ return func;
+}
+
+void ActivationLayer::print_info()
+{
+ if(_hint == Hint::OPENCL)
+ {
+ std::cout << "Instantiating CLActivationLayer";
+ }
+ else
+ {
+ std::cout << "Instantiating NEActivationLayer";
+ }
+
+ std::cout << " Data Type: " << _input->info()->data_type()
+ << " Input shape: " << _input->info()->tensor_shape()
+ << " Output shape: " << _output->info()->tensor_shape()
+ << " Activation function: " << _activation_info.activation()
+ << " a: " << _activation_info.a()
+ << " b: " << _activation_info.b()
+ << std::endl;
+}
diff --git a/src/graph/nodes/ConvolutionLayer.cpp b/src/graph/nodes/ConvolutionLayer.cpp
new file mode 100644
index 0000000..b80bf93
--- /dev/null
+++ b/src/graph/nodes/ConvolutionLayer.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ConvolutionLayer.h"
+
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename ConvolutionType, typename TensorType, Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ bool weights_are_loaded = weights.tensor() != nullptr;
+ bool biases_are_loaded = biases.tensor() != nullptr;
+
+ auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
+ conv->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(weights.set_target(hint)),
+ dynamic_cast<TensorType *>(biases.set_target(hint)),
+ dynamic_cast<TensorType *>(output),
+ conv_info, weights_info);
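+ // Weights/biases that were not preloaded are allocated and filled through
+ // their accessor only after configure(), so their tensor info (including any
+ // padding added during configuration) is final before allocation.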
+ if(!weights_are_loaded)
+ {
+ weights.allocate_and_fill_if_needed();
+ }
+ if(!biases_are_loaded)
+ {
+ biases.allocate_and_fill_if_needed();
+ }
+
+ return std::move(conv);
+}
+
+template <Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::CLTensor, Hint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::Tensor, Hint::NEON>(input, weights, biases, output, conv_info, weights_info);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+{
+ if(_weights.tensor() == nullptr)
+ {
+ _weights.set_info(TensorInfo(TensorShape(_conv_width, _conv_height, input->info()->dimension(2), _ofm), input->info()->num_channels(), input->info()->data_type(),
+ input->info()->fixed_point_position()));
+ }
+ if(_biases.tensor() == nullptr)
+ {
+ _biases.set_info(TensorInfo(TensorShape(_ofm), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+
+ std::unique_ptr<arm_compute::IFunction> func;
+ _hint = hint;
+ _input = input;
+ _output = output;
+
+ if(_hint == Hint::OPENCL)
+ {
+ func = instantiate<Hint::OPENCL>(input, _weights, _biases, output, _conv_info, _weights_info);
+ }
+ else
+ {
+ func = instantiate<Hint::NEON>(input, _weights, _biases, output, _conv_info, _weights_info);
+ }
+
+ return func;
+}
+
+void ConvolutionLayer::print_info()
+{
+ if(_hint == Hint::OPENCL)
+ {
+ std::cout << "Instantiating CLConvolutionLayer";
+ }
+ else
+ {
+ std::cout << "Instantiating NEConvolutionLayer";
+ }
+ std::cout << " Type: " << _input->info()->data_type() << " Input Shape: " << _input->info()->tensor_shape() << " Weights shape: " << _weights.info().tensor_shape() << " Biases Shape: " <<
+ _biases.info().tensor_shape() << " Output Shape: " << _output->info()->tensor_shape() << " PadStrideInfo: " << _conv_info << " WeightsInfo: " << _weights_info << std::endl;
+}
diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
new file mode 100644
index 0000000..8d244cb
--- /dev/null
+++ b/src/graph/nodes/FullyConnectedLayer.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/FullyConnectedLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename FullyConnectedType, typename TensorType, Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
+{
+ bool weights_are_loaded = weights.tensor() != nullptr;
+ bool biases_are_loaded = biases.tensor() != nullptr;
+
+ auto conv = arm_compute::support::cpp14::make_unique<FullyConnectedType>();
+ conv->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(weights.set_target(hint)),
+ dynamic_cast<TensorType *>(biases.set_target(hint)),
+ dynamic_cast<TensorType *>(output));
+ if(!weights_are_loaded)
+ {
+ weights.allocate_and_fill_if_needed();
+ }
+ if(!biases_are_loaded)
+ {
+ biases.allocate_and_fill_if_needed();
+ }
+
+ return std::move(conv);
+}
+
+template <Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
+{
+ return instantiate_function<arm_compute::CLFullyConnectedLayer, arm_compute::CLTensor, Hint::OPENCL>(input, weights, biases, output);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
+{
+ return instantiate_function<arm_compute::NEFullyConnectedLayer, arm_compute::Tensor, Hint::NEON>(input, weights, biases, output);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> FullyConnectedLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+{
+ if(_weights.tensor() == nullptr)
+ {
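+ // No weights were supplied: flatten every non-batch input dimension into
+ // num_weights and create a (num_weights x _num_neurons) weight matrix.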
+ unsigned int num_weights = 1;
+ unsigned int num_dimensions = input->info()->num_dimensions();
+ // Ignore the batch dimension if there is one:
+ if(num_dimensions == 2 || num_dimensions == 4)
+ {
+ num_dimensions--;
+ }
+ for(unsigned int i = 0; i < num_dimensions; i++)
+ {
+ num_weights *= input->info()->dimension(i);
+ }
+ _weights.set_info(TensorInfo(TensorShape(num_weights, _num_neurons), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+ if(_biases.tensor() == nullptr)
+ {
+ _biases.set_info(TensorInfo(TensorShape(_num_neurons), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+
+ arm_compute::auto_init_if_empty(*output->info(), TensorShape(_num_neurons, input->info()->dimension(1)), input->info()->num_channels(), input->info()->data_type(),
+ input->info()->fixed_point_position());
+
+ std::unique_ptr<arm_compute::IFunction> func;
+ _hint = hint;
+ _input = input;
+ _output = output;
+
+ if(_hint == Hint::OPENCL)
+ {
+ func = instantiate<Hint::OPENCL>(input, _weights, _biases, output);
+ }
+ else
+ {
+ func = instantiate<Hint::NEON>(input, _weights, _biases, output);
+ }
+
+ return func;
+}
+
+void FullyConnectedLayer::print_info()
+{
+ if(_hint == Hint::OPENCL)
+ {
+ std::cout << "Instantiating CLFullyConnectedLayer";
+ }
+ else
+ {
+ std::cout << "Instantiating NEFullyConnectedLayer";
+ }
+ std::cout << " Type: " << _input->info()->data_type() << " Input Shape: " << _input->info()->tensor_shape() << " Weights shape: " << _weights.info().tensor_shape() << " Biases Shape: " <<
+ _biases.info().tensor_shape() << " Output Shape: " << _output->info()->tensor_shape() << std::endl;
+}
diff --git a/src/graph/nodes/PoolingLayer.cpp b/src/graph/nodes/PoolingLayer.cpp
new file mode 100644
index 0000000..f29332f
--- /dev/null
+++ b/src/graph/nodes/PoolingLayer.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/PoolingLayer.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename PoolingType, typename TensorType, Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+ auto pool = arm_compute::support::cpp14::make_unique<PoolingType>();
+ pool->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ pool_info);
+
+ return std::move(pool);
+}
+
+template <Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+ return instantiate_function<arm_compute::CLPoolingLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output, pool_info);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+ return instantiate_function<arm_compute::NEPoolingLayer, arm_compute::Tensor, Hint::NEON>(input, output, pool_info);
+}
+} // namespace
+
+PoolingLayer::PoolingLayer(const PoolingLayerInfo pool_info)
+ : _pool_info(pool_info)
+{
+}
+
+std::unique_ptr<arm_compute::IFunction> PoolingLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+{
+ std::unique_ptr<arm_compute::IFunction> func;
+ _hint = hint;
+ _input = input;
+ _output = output;
+
+ if(_hint == Hint::OPENCL)
+ {
+ func = instantiate<Hint::OPENCL>(input, output, _pool_info);
+ }
+ else
+ {
+ func = instantiate<Hint::NEON>(input, output, _pool_info);
+ }
+
+ return func;
+}
+
+void PoolingLayer::print_info()
+{
+ if(_hint == Hint::OPENCL)
+ {
+ std::cout << "Instantiating CLPoolingLayer";
+ }
+ else
+ {
+ std::cout << "Instantiating NEPoolingLayer";
+ }
+
+ std::cout << " Data Type: " << _input->info()->data_type()
+ << " Input shape: " << _input->info()->tensor_shape()
+ << " Output shape: " << _output->info()->tensor_shape()
+ << " Pooling info: " << _pool_info << std::endl;
+}
diff --git a/src/graph/nodes/SoftmaxLayer.cpp b/src/graph/nodes/SoftmaxLayer.cpp
new file mode 100644
index 0000000..fee8897
--- /dev/null
+++ b/src/graph/nodes/SoftmaxLayer.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/SoftmaxLayer.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename SoftmaxType, typename TensorType, Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output)
+{
+ auto softmax = arm_compute::support::cpp14::make_unique<SoftmaxType>();
+ softmax->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output));
+
+ return std::move(softmax);
+}
+
+template <Hint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output)
+{
+ return instantiate_function<arm_compute::CLSoftmaxLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output)
+{
+ return instantiate_function<arm_compute::NESoftmaxLayer, arm_compute::Tensor, Hint::NEON>(input, output);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> SoftmaxLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+{
+ std::unique_ptr<arm_compute::IFunction> func;
+ _hint = hint;
+ _input = input;
+ _output = output;
+
+ if(_hint == Hint::OPENCL)
+ {
+ func = instantiate<Hint::OPENCL>(input, output);
+ }
+ else
+ {
+ func = instantiate<Hint::NEON>(input, output);
+ }
+
+ return func;
+}
+
+void SoftmaxLayer::print_info()
+{
+ if(_hint == Hint::OPENCL)
+ {
+ std::cout << "Instantiating CLSoftmaxLayer";
+ }
+ else
+ {
+ std::cout << "Instantiating NESoftmaxLayer";
+ }
+ std::cout << " Data Type: " << _input->info()->data_type()
+ << " Input shape: " << _input->info()->tensor_shape()
+ << " Output shape: " << _output->info()->tensor_shape()
+ << std::endl;
+}
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
new file mode 100644
index 0000000..50b0f0e
--- /dev/null
+++ b/src/runtime/Allocator.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Allocator.h"
+
+#include "arm_compute/core/Error.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+void *Allocator::allocate(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(alignment);
+ return ::operator new(size);
+}
+
+void Allocator::free(void *ptr)
+{
+ ::operator delete(ptr);
+}
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
new file mode 100644
index 0000000..69292b9
--- /dev/null
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/BlobMemoryPool.h"
+#include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemoryGroup.h"
+#include "support/ToolchainSupport.h"
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <vector>
+
+using namespace arm_compute;
+
+BlobLifetimeManager::BlobLifetimeManager()
+ : _active_group(nullptr), _active_elements(), _finalized_groups(), _blobs()
+{
+}
+
+void BlobLifetimeManager::register_group(IMemoryGroup *group)
+{
+ if(_active_group == nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON(group == nullptr);
+ _active_group = group;
+ }
+}
+
+void BlobLifetimeManager::start_lifetime(void *obj)
+{
+ ARM_COMPUTE_ERROR_ON(obj == nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
+ {
+ return obj == e.id;
+ }) != std::end(_active_elements),
+ "Memory object is already registered!");
+
+ // Register the object as active; its finalized status starts out false
+ _active_elements.emplace_back(obj);
+}
+
+void BlobLifetimeManager::end_lifetime(void *obj, void **handle, size_t size)
+{
+ ARM_COMPUTE_ERROR_ON(obj == nullptr);
+
+ // Find object
+ auto it = std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
+ {
+ return obj == e.id;
+ });
+ ARM_COMPUTE_ERROR_ON(it == std::end(_active_elements));
+
+ // Update object fields and mark object as complete
+ it->handle = handle;
+ it->size = size;
+ it->status = true;
+
+ // Check if all objects are finalized and, if so, reset the active group
+ if(are_all_finalized())
+ {
+ // Update finalized groups
+ _finalized_groups[_active_group].insert(std::end(_finalized_groups[_active_group]), std::begin(_active_elements), std::end(_active_elements));
+
+ // Update blobs and group mappings
+ update_blobs_and_mappings();
+
+ // Reset state
+ _active_elements.clear();
+ _active_group = nullptr;
+ }
+}
+
+std::unique_ptr<IMemoryPool> BlobLifetimeManager::create_pool(IAllocator *allocator)
+{
+ ARM_COMPUTE_ERROR_ON(allocator == nullptr);
+ return support::cpp14::make_unique<BlobMemoryPool>(allocator, _blobs);
+}
+
+bool BlobLifetimeManager::are_all_finalized() const
+{
+ return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const Element e)
+ {
+ return !e.status;
+ });
+}
+
+MappingType BlobLifetimeManager::mapping_type() const
+{
+ return MappingType::BLOBS;
+}
+
+void BlobLifetimeManager::update_blobs_and_mappings()
+{
+ ARM_COMPUTE_ERROR_ON(!are_all_finalized());
+ ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
+
+ // Sort active group requirements in descending order
+ std::sort(std::begin(_active_elements), std::end(_active_elements), [](const Element & a, const Element & b)
+ {
+ return a.size > b.size;
+ });
+ std::vector<size_t> group_sizes;
+ std::transform(std::begin(_active_elements), std::end(_active_elements), std::back_inserter(group_sizes), [](const Element & e)
+ {
+ return e.size;
+ });
+
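+ // Blobs are shared across memory groups: grow each blob to the element-wise
+ // maximum of the sorted per-group requirements so it can satisfy the largest
+ // request mapped to that index.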
+ // Update blob sizes
+ size_t max_size = std::max(_blobs.size(), group_sizes.size());
+ _blobs.resize(max_size, 0);
+ group_sizes.resize(max_size, 0);
+ std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](size_t lhs, size_t rhs)
+ {
+ return std::max(lhs, rhs);
+ });
+
+ // Calculate group mappings
+ auto &group_mappings = _active_group->mappings();
+ int blob_idx = 0;
+ for(auto &e : _active_elements)
+ {
+ group_mappings[e.handle] = blob_idx++;
+ }
+}
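
The blob-merging arithmetic above is worth seeing in isolation: each group's size requirements are sorted in descending order and folded element-wise into a running maximum, so one shared set of blobs can back every finalized group. Below is a minimal standalone sketch of that step using only the standard library and hypothetical sizes; it mirrors update_blobs_and_mappings but involves no arm_compute types.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Merge one group's size requirements into the shared blob sizes, mirroring the
// sort + element-wise max performed by BlobLifetimeManager::update_blobs_and_mappings.
void merge_group(std::vector<std::size_t> &blobs, std::vector<std::size_t> group)
{
    std::sort(group.begin(), group.end(), std::greater<std::size_t>());
    const std::size_t max_size = std::max(blobs.size(), group.size());
    blobs.resize(max_size, 0);
    group.resize(max_size, 0);
    std::transform(blobs.begin(), blobs.end(), group.begin(), blobs.begin(),
                   [](std::size_t lhs, std::size_t rhs) { return std::max(lhs, rhs); });
}

int main()
{
    std::vector<std::size_t> blobs;
    merge_group(blobs, { 4096, 1024 });     // first group needs two buffers (hypothetical sizes)
    merge_group(blobs, { 512, 2048, 512 }); // second group needs three
    for(std::size_t s : blobs)
    {
        std::cout << s << " "; // prints "4096 1024 512": three blobs cover both groups
    }
    std::cout << std::endl;
    return 0;
}
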
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
new file mode 100644
index 0000000..29505e5
--- /dev/null
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/BlobMemoryPool.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/Types.h"
+#include "support/ToolchainSupport.h"
+
+#include <vector>
+
+using namespace arm_compute;
+
+BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<size_t> blob_sizes)
+ : _allocator(allocator), _blobs(), _blob_sizes(std::move(blob_sizes))
+{
+ ARM_COMPUTE_ERROR_ON(!allocator);
+ allocate_blobs(_blob_sizes);
+}
+
+BlobMemoryPool::~BlobMemoryPool()
+{
+ ARM_COMPUTE_ERROR_ON(!_allocator);
+ free_blobs();
+}
+
+void BlobMemoryPool::acquire(MemoryMappings &handles)
+{
+ // Bind pool memory to the registered handles
+ for(auto &handle : handles)
+ {
+ ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
+ *handle.first = _blobs[handle.second];
+ }
+}
+
+void BlobMemoryPool::release(MemoryMappings &handles)
+{
+ for(auto &handle : handles)
+ {
+ ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
+ *handle.first = nullptr;
+ }
+}
+
+MappingType BlobMemoryPool::mapping_type() const
+{
+ return MappingType::BLOBS;
+}
+
+std::unique_ptr<IMemoryPool> BlobMemoryPool::duplicate()
+{
+ ARM_COMPUTE_ERROR_ON(!_allocator);
+ return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_sizes);
+}
+
+void BlobMemoryPool::allocate_blobs(const std::vector<size_t> &sizes)
+{
+ ARM_COMPUTE_ERROR_ON(!_allocator);
+
+ for(const auto &size : sizes)
+ {
+ _blobs.push_back(_allocator->allocate(size, 0));
+ }
+}
+
+void BlobMemoryPool::free_blobs()
+{
+ ARM_COMPUTE_ERROR_ON(!_allocator);
+
+ for(auto &blob : _blobs)
+ {
+ _allocator->free(blob);
+ }
+ _blobs.clear();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
new file mode 100644
index 0000000..9a5c13a
--- /dev/null
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLBufferAllocator.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+CLBufferAllocator::CLBufferAllocator(cl::Context context)
+ : _context(std::move(context))
+{
+}
+
+void *CLBufferAllocator::allocate(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(alignment);
+ cl_mem buf = clCreateBuffer(_context.get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr);
+ return static_cast<void *>(buf);
+}
+
+void CLBufferAllocator::free(void *ptr)
+{
+ ARM_COMPUTE_ERROR_ON(ptr == nullptr);
+ clReleaseMemObject(static_cast<cl_mem>(ptr));
+}
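
CLBufferAllocator returns the raw cl_mem wrapped in a void*, so the result is an opaque device handle rather than host-accessible memory. A minimal usage sketch follows, assuming default_init() is available to set up the OpenCL context and queue (as the CLScheduler error message later in this patch suggests) and using an arbitrary 1 MiB size.

#include "arm_compute/runtime/CL/CLBufferAllocator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

int main()
{
    // Initialise the default OpenCL context/queue (assumed available on the platform)
    arm_compute::CLScheduler::get().default_init();

    // Allocate a 1 MiB device buffer; the return value wraps a cl_mem handle
    arm_compute::CLBufferAllocator allocator(arm_compute::CLScheduler::get().context());
    void *buffer = allocator.allocate(1024 * 1024, 0);

    // ... hand the handle to a memory pool or other consumer ...

    // Release the cl_mem once the buffer is no longer needed
    allocator.free(buffer);
    return 0;
}
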
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
index b9e8739..88d45ac 100644
--- a/src/runtime/CL/CLMultiHOG.cpp
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -25,12 +25,12 @@
#include "arm_compute/core/CL/ICLHOG.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
CLMultiHOG::CLMultiHOG(size_t num_models)
- : _num_models(num_models), _model(arm_compute::cpp14::make_unique<CLHOG[]>(_num_models))
+ : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<CLHOG[]>(_num_models))
{
}
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
index 41d81ea..865f389 100644
--- a/src/runtime/CL/CLPyramid.cpp
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -24,10 +24,10 @@
#include "arm_compute/runtime/CL/CLPyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PyramidInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
+#include "support/ToolchainSupport.h"
#include <array>
#include <cmath>
@@ -52,7 +52,7 @@
void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
{
_info = info;
- _pyramid = arm_compute::cpp14::make_unique<CLTensor[]>(_info.num_levels());
+ _pyramid = arm_compute::support::cpp14::make_unique<CLTensor[]>(_info.num_levels());
size_t w = _info.width();
size_t h = _info.height();
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index fe25ce5..71a749f 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -24,11 +24,12 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
using namespace arm_compute;
CLScheduler::CLScheduler()
- : _context(), _queue(), _target(GPUTarget::MIDGARD)
+ : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
{
}
@@ -40,10 +41,22 @@
void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
{
+ ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
+ "The CLScheduler is not initialised yet! Please call CLScheduler::get().default_init(), \
+ or CLScheduler::get().init() and CLKernelLibrary::get().init() before running functions!");
+
+ // Tune the kernel if the CLTuner has been provided
+ if(_cl_tuner != nullptr)
+ {
+ // Tune the OpenCL kernel
+ _cl_tuner->tune_kernel(kernel);
+ }
+
+ // Run kernel
kernel.run(kernel.window(), _queue);
if(flush)
{
_queue.flush();
}
-}
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index eefa033..bc513d1 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -28,7 +28,7 @@
using namespace arm_compute;
CLTensor::CLTensor()
- : _allocator()
+ : _allocator(this)
{
}
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 8112a71..ad165fa 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -25,15 +25,21 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
-CLTensorAllocator::CLTensorAllocator()
- : _buffer(), _mapping(nullptr)
+CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
+ : _associated_memory_group(nullptr), _buffer(), _mapping(nullptr), _owner(owner)
{
}
+CLTensorAllocator::~CLTensorAllocator()
+{
+ _buffer = cl::Buffer();
+}
+
uint8_t *CLTensorAllocator::data()
{
return _mapping;
@@ -47,17 +53,32 @@
void CLTensorAllocator::allocate()
{
ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
-
- _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+ if(_associated_memory_group == nullptr)
+ {
+ _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+ }
+ else
+ {
+ _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer()), info().total_size());
+ }
info().set_is_resizable(false);
}
void CLTensorAllocator::free()
{
- ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ if(_associated_memory_group == nullptr)
+ {
+ _buffer = cl::Buffer();
+ info().set_is_resizable(true);
+ }
+}
- _buffer = cl::Buffer();
- info().set_is_resizable(true);
+void CLTensorAllocator::set_associated_memory_group(CLMemoryGroup *associated_memory_group)
+{
+ ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
+ ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
+ ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+ _associated_memory_group = associated_memory_group;
}
uint8_t *CLTensorAllocator::lock()
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
new file mode 100644
index 0000000..7f5be86
--- /dev/null
+++ b/src/runtime/CL/CLTuner.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTuner.h"
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <chrono>
+#include <limits>
+#include <string>
+
+using namespace arm_compute;
+
+CLTuner::CLTuner()
+ : _lws_table()
+{
+}
+
+void CLTuner::tune_kernel(ICLKernel &kernel)
+{
+ // Get the configuration ID from the kernel
+ const std::string &config_id = kernel.config_id();
+
+ // Check if we need to find the optimal LWS. If config_id is equal to default_config_id, the kernel does not need to be tuned
+ if(config_id != arm_compute::default_config_id)
+ {
+ auto p = _lws_table.find(config_id);
+
+ if(p == _lws_table.end())
+ {
+ // Find the optimal LWS for the kernel
+ cl::NDRange opt_lws = find_optimal_lws(kernel);
+
+ // Insert the optimal LWS in the table
+ _lws_table.emplace(config_id, opt_lws);
+
+ // Set Local-Workgroup-Size
+ kernel.set_lws_hint(opt_lws);
+ }
+ else
+ {
+ // Set Local-Workgroup-Size
+ kernel.set_lws_hint(p->second);
+ }
+ }
+}
+
+cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ double min_exec_time = std::numeric_limits<double>::max();
+
+ cl::NDRange opt_lws = cl::NDRange(1, 1);
+
+ for(int y = 1; y <= 16; ++y)
+ {
+ for(int x = 1; x <= 16; ++x)
+ {
+ cl::NDRange lws_test = cl::NDRange(x, y);
+
+ // Set the Local-Workgroup-Size
+ kernel.set_lws_hint(lws_test);
+
+ auto t_start = std::chrono::high_resolution_clock::now();
+
+ // Run
+ kernel.run(kernel.window(), q);
+
+ CLScheduler::get().sync();
+
+ auto t_stop = std::chrono::high_resolution_clock::now();
+
+ std::chrono::duration<double, std::nano> fp_nano = t_stop - t_start;
+
+ // Check the execution time
+ if(fp_nano.count() < min_exec_time)
+ {
+ min_exec_time = fp_nano.count();
+ opt_lws = cl::NDRange(x, y);
+ }
+ }
+ }
+
+ return opt_lws;
+}
+
+void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table)
+{
+ _lws_table.clear();
+ _lws_table = lws_table;
+}
+
+const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table()
+{
+ return _lws_table;
+}
\ No newline at end of file
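
Since find_optimal_lws times every candidate local-workgroup size by actually running the kernel, tuning is expensive; the import/export hooks exist so the table can be captured once and reused on later runs. A minimal sketch of that round trip is shown below; the on-disk serialization of the exported map is left out, and the workload run that would populate the table is only indicated by a comment.

#include "arm_compute/runtime/CL/CLTuner.h"

#include <string>
#include <unordered_map>

int main()
{
    arm_compute::CLTuner tuner;

    // ... run the workload once with this tuner attached so the LWS table gets populated ...

    // Capture the tuned local-workgroup sizes, keyed by kernel config_id
    std::unordered_map<std::string, cl::NDRange> table = tuner.export_lws_table();

    // ... persist 'table' in whatever format the application prefers ...

    // On a later run, seed a fresh tuner with the saved results to skip re-tuning
    arm_compute::CLTuner warm_tuner;
    warm_tuner.import_lws_table(table);
    return 0;
}
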
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
index aa45743..a1a56fd 100644
--- a/src/runtime/CL/ICLSimpleFunction.cpp
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -28,8 +28,9 @@
using namespace arm_compute;
-ICLSimpleFunction::ICLSimpleFunction()
- : _kernel(), _border_handler()
+ICLSimpleFunction::ICLSimpleFunction() // NOLINT
+ : _kernel(),
+ _border_handler()
{
}
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
index 5097dd4..5613e6c 100644
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
index 56c5199..78f25fc 100644
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,21 +32,21 @@
void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum)
{
- auto k = arm_compute::cpp14::make_unique<CLAccumulateKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAccumulateKernel>();
k->configure(input, accum);
_kernel = std::move(k);
}
void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
{
- auto k = arm_compute::cpp14::make_unique<CLAccumulateWeightedKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAccumulateWeightedKernel>();
k->configure(input, alpha, accum);
_kernel = std::move(k);
}
void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
{
- auto k = arm_compute::cpp14::make_unique<CLAccumulateSquaredKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAccumulateSquaredKernel>();
k->configure(input, shift, accum);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 9b5bd8b..b64739a 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -24,13 +24,13 @@
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-void CLActivationLayer::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
{
- auto k = arm_compute::cpp14::make_unique<CLActivationLayerKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
k->configure(input, output, act_info);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
index 36bff42..5ca384d 100644
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ b/src/runtime/CL/functions/CLArithmeticAddition.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLArithmeticAddition::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
- auto k = arm_compute::cpp14::make_unique<CLArithmeticAdditionKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
index 97f0a1c..651f51a 100644
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
- auto k = arm_compute::cpp14::make_unique<CLArithmeticSubtractionKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index 3df673c..68cdaac 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -37,7 +37,7 @@
{
}
-void CLBatchNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
+void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
{
_norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
}
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index 7c85043..f8a5a85 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseAndKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseAndKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index 17ae5de..dc002e5 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseNotKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseNotKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index c84a279..4a10bb2 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseOrKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseOrKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index fd49c7d..d23622a 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseXorKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseXorKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
index 8de6807..f28be44 100644
--- a/src/runtime/CL/functions/CLBox3x3.cpp
+++ b/src/runtime/CL/functions/CLBox3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLBox3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBox3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 1d018b8..5acb8e7 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -26,17 +26,31 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLCannyEdge::CLCannyEdge()
- : _sobel(nullptr), _gradient(), _border_mag_gradient(), _non_max_suppr(), _edge_trace(), _gx(), _gy(), _mag(), _phase(), _nonmax(), _visited(), _recorded(), _l1_list_counter(), _l1_stack()
+CLCannyEdge::CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _sobel(),
+ _gradient(),
+ _border_mag_gradient(),
+ _non_max_suppr(),
+ _edge_trace(),
+ _gx(),
+ _gy(),
+ _mag(),
+ _phase(),
+ _nonmax(),
+ _visited(),
+ _recorded(),
+ _l1_list_counter(),
+ _l1_stack()
{
}
@@ -83,22 +97,26 @@
TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32);
_l1_stack.allocator()->init(info_s32);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Configure/Init sobelNxN
if(gradient_size == 3)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 5)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 7)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
@@ -107,23 +125,43 @@
ARM_COMPUTE_ERROR("Gradient size %d not supported", gradient_size);
}
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Configure gradient
_gradient.configure(&_gx, &_gy, &_mag, &_phase, norm_type);
+ // Allocate intermediate buffers
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_nonmax);
+
// Configure non-maxima suppression
_non_max_suppr.configure(&_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
+ // Allocate intermediate buffers
+ _phase.allocator()->allocate();
+
// Fill border around magnitude image as non-maxima suppression will access
// it. If border mode is undefined filling the border is a nop.
_border_mag_gradient.configure(&_mag, _non_max_suppr.border_size(), border_mode, constant_border_value);
+ // Allocate intermediate buffers
+ _mag.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_visited);
+ _memory_group.manage(&_recorded);
+ _memory_group.manage(&_l1_stack);
+ _memory_group.manage(&_l1_list_counter);
+
// Configure edge tracing
_edge_trace.configure(&_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
- _gx.allocator()->allocate();
- _gy.allocator()->allocate();
- _phase.allocator()->allocate();
- _mag.allocator()->allocate();
+ // Allocate intermediate buffers
_visited.allocator()->allocate();
_recorded.allocator()->allocate();
_l1_stack.allocator()->allocate();
@@ -133,6 +171,8 @@
void CLCannyEdge::run()
{
+ _memory_group.acquire();
+
// Run sobel
_sobel->run();
@@ -152,4 +192,6 @@
_l1_list_counter.clear(CLScheduler::get().queue());
_l1_stack.clear(CLScheduler::get().queue());
CLScheduler::get().enqueue(_edge_trace, true);
+
+ _memory_group.release();
}
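
The memory-group protocol introduced here recurs in the other function changes in this patch: manage() declares an intermediate tensor before its consumers are configured, allocate() marks the end of its lifetime once the last consumer has been configured, and acquire()/release() bracket run() so pooled memory is only held while the function executes. A minimal sketch of that protocol on a hypothetical function (kernel configuration elided) follows.

#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IMemoryManager.h"

#include <memory>

// Hypothetical function-style wrapper showing the lifetime protocol only;
// the actual kernel configuration calls are elided.
class ExampleFunction
{
public:
    ExampleFunction(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
        : _memory_group(std::move(memory_manager)), _tmp()
    {
    }

    void configure(/* inputs/outputs */)
    {
        // Declare the intermediate tensor to the group before any consumer is configured
        _memory_group.manage(&_tmp);

        // ... configure the kernels that write to / read from _tmp ...

        // Mark the end of _tmp's lifetime once its last consumer has been configured
        _tmp.allocator()->allocate();
    }

    void run()
    {
        // Acquire backing memory for all managed tensors, run, then return it to the pool
        _memory_group.acquire();
        // ... enqueue kernels ...
        _memory_group.release();
    }

private:
    arm_compute::CLMemoryGroup _memory_group;
    arm_compute::CLTensor      _tmp;
};
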
diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
index 79a3676..11605cf 100644
--- a/src/runtime/CL/functions/CLChannelCombine.cpp
+++ b/src/runtime/CL/functions/CLChannelCombine.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,14 +32,14 @@
void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
k->configure(plane0, plane1, plane2, plane3, output);
_kernel = std::move(k);
}
void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
k->configure(plane0, plane1, plane2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
index 2c6174b..5090382 100644
--- a/src/runtime/CL/functions/CLChannelExtract.cpp
+++ b/src/runtime/CL/functions/CLChannelExtract.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,14 +32,14 @@
void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
k->configure(input, channel, output);
_kernel = std::move(k);
}
void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
k->configure(input, channel, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
index 2fe465a..65f8ac3 100644
--- a/src/runtime/CL/functions/CLColorConvert.cpp
+++ b/src/runtime/CL/functions/CLColorConvert.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,28 +32,28 @@
void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 21b5d47..a9b0867 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -26,13 +26,13 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -40,15 +40,15 @@
void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLConvolution3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLConvolution3x3Kernel>();
k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::CLConvolutionSquare()
- : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
{
}
@@ -66,6 +66,9 @@
std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
if(scale == 0)
{
scale = calculate_matrix_scale(conv, matrix_size);
@@ -92,8 +95,12 @@
if(_is_separable)
{
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
+
+ _memory_group.release();
}
else
{
@@ -107,7 +114,7 @@
void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLConvolutionRectangleKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLConvolutionRectangleKernel>();
k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index f0bbc35..4b1bfd8 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -24,32 +24,31 @@
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include <cmath>
+#include <memory>
#include <tuple>
using namespace arm_compute;
-CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
- : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
}
void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
@@ -65,10 +64,12 @@
const unsigned int mat_weights_cols = weights->info()->dimension(3);
const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- const DataType dt = weights->info()->data_type();
- TensorInfo info_wr(shape_wr, 1, dt);
+ const DataType dt = weights->info()->data_type();
+ const int fixed_point_position = weights->info()->fixed_point_position();
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
_weights_reshaped.allocator()->init(info_wr);
+ _memory_group.manage(&_weights_reshaped);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_weights_transposed_kernel.configure(&_weights_reshaped, output);
_weights_reshaped.allocator()->allocate();
@@ -81,41 +82,50 @@
void CLConvolutionLayerReshapeWeights::run()
{
+ _memory_group.acquire();
+
cl::CommandQueue q = CLScheduler::get().queue();
CLScheduler::get().enqueue(_weights_reshape_kernel);
if(_transpose1xW)
{
CLScheduler::get().enqueue(_weights_transposed_kernel);
}
+
+ _memory_group.release();
}
-CLConvolutionLayer::CLConvolutionLayer()
- : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
- _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
+ _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
{
}
void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
}
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // Set the GPU target for matrix multiply
+ _mm_kernel.set_target(CLScheduler::get().target());
+
_has_bias = (biases != nullptr);
_are_weights_reshaped = weights_info.are_reshaped();
- // Get parameters for conv_info
+ // Get parameters from conv_info
unsigned int stride_x = 0;
unsigned int stride_y = 0;
unsigned int pad_x = 0;
@@ -127,20 +137,21 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- const unsigned int kernel_width = _are_weights_reshaped ? weights_info.kernel_size() : weights->info()->dimension(0);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
- stride_x, stride_y, pad_x, pad_y, conv_info.round());
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
+ const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+ conv_info);
// Check if its a "fully connected" convolution
_is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
- // Create tensor to store the reshaped weights
- size_t mat_weights_cols = weights->info()->dimension(3);
- size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ unsigned int mat_weights_cols = weights->info()->dimension(3);
+ unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+
+ // Reshape weights if needed
if(_are_weights_reshaped)
{
- mat_weights_cols = output->info()->dimension(2);
+ mat_weights_cols = weights_info.num_kernels();
const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
}
@@ -150,77 +161,75 @@
{
// Create tensor to store the reshaped weights
TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, weights->info()->data_type());
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
_weights_reshaped.allocator()->init(info_wr);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, false);
- weights = &_weights_reshaped;
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
}
else
{
// Create tensor to store transposed weights
- TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
- TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
- _weights_transposed.allocator()->init(info_wt);
- _reshape_weights.configure(weights, biases, &_weights_transposed, true);
- weights = &_weights_transposed;
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
}
+ weights = &_weights_reshaped;
}
+
// Create tensor to store im2col reshaped inputs
- const size_t mat_input_cols = mat_weights_rows;
- const size_t mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
+ const unsigned int mat_input_cols = mat_weights_rows;
+ const unsigned int mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+ _memory_group.manage(&_input_im2col_reshaped);
// Create tensor (interleave) to prepare input tensor for GEMM
if(!_is_fully_connected_convolution)
{
TensorShape shape_interleaved = shape_im2col;
shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4.f));
- _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ _memory_group.manage(&_input_interleaved_reshaped);
}
// Create GEMM output tensor
TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
- _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+ _memory_group.manage(&_gemm_output);
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
- _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+ // Configure matrix multiply
if(_is_fully_connected_convolution)
{
- _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ // The matrices A and B have not been reshaped
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f, false);
}
else
{
_input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
_mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
- }
-
- if(!_are_weights_reshaped)
- {
- if(!_is_fully_connected_convolution)
- {
- _weights_transposed.allocator()->allocate();
- }
- else
- {
- _weights_reshaped.allocator()->allocate();
- }
- }
-
- _input_im2col_reshaped.allocator()->allocate();
- if(!_is_fully_connected_convolution)
- {
_input_interleaved_reshaped.allocator()->allocate();
}
+ _input_im2col_reshaped.allocator()->allocate();
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
_gemm_output.allocator()->allocate();
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+ // Allocate intermediate tensor
+ if(!_are_weights_reshaped)
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
}
void CLConvolutionLayer::run()
@@ -232,6 +241,8 @@
_reshape_weights.run();
}
+ _memory_group.acquire();
+
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
if(!_is_fully_connected_convolution)
@@ -244,4 +255,6 @@
// Reshape output matrix
CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+ _memory_group.release();
}
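
The shape bookkeeping in CLConvolutionLayer::configure is easier to follow in isolation: the reshaped weights have one column per output feature map and one row per receptive-field element (plus one when a bias is appended), the im2col output has one row per output pixel, and the GEMM result is later folded back by col2im. A small standalone sketch of that arithmetic with hypothetical layer dimensions:

#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical layer: 3x3 kernel, 64 input channels, 128 output feature maps,
    // producing a 56x56 output (conv_w x conv_h), with a bias term.
    const std::size_t kernel_w = 3, kernel_h = 3, ifm = 64, ofm = 128;
    const std::size_t conv_w = 56, conv_h = 56;
    const bool        has_bias = true;

    // Weights reshaped for GEMM: one column per output feature map,
    // one row per receptive-field element (+1 row when a bias is appended)
    const std::size_t mat_weights_cols = ofm;
    const std::size_t mat_weights_rows = kernel_w * kernel_h * ifm + (has_bias ? 1 : 0);

    // im2col output: one row per output pixel, one column per receptive-field element
    const std::size_t mat_input_cols = mat_weights_rows;
    const std::size_t mat_input_rows = conv_w * conv_h;

    // GEMM result: (mat_input_rows x mat_weights_cols), reshaped back to WxHxOFM by col2im
    std::cout << "weights: " << mat_weights_rows << "x" << mat_weights_cols << "\n"
              << "im2col : " << mat_input_rows << "x" << mat_input_cols << "\n"
              << "gemm   : " << mat_input_rows << "x" << mat_weights_cols << "\n";
    return 0;
}
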
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
index d967d98..f42627f 100644
--- a/src/runtime/CL/functions/CLDepthConcatenate.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -24,22 +24,23 @@
#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLDepthConcatenate::CLDepthConcatenate()
- : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+CLDepthConcatenate::CLDepthConcatenate() // NOLINT
+ : _inputs_vector(),
+ _concat_kernels_vector(),
+ _border_handlers_vector(),
+ _num_inputs(0)
{
}
-void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output)
+void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
{
ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
@@ -47,8 +48,8 @@
unsigned int depth_offset = 0;
- _concat_kernels_vector = arm_compute::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
- _border_handlers_vector = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+ _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
for(unsigned int i = 0; i < _num_inputs; i++)
{
diff --git a/src/runtime/CL/functions/CLDepthConvert.cpp b/src/runtime/CL/functions/CLDepthConvert.cpp
index edcd492..b64d05b 100644
--- a/src/runtime/CL/functions/CLDepthConvert.cpp
+++ b/src/runtime/CL/functions/CLDepthConvert.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLDepthConvert::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
{
- auto k = arm_compute::cpp14::make_unique<CLDepthConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertKernel>();
k->configure(input, output, policy, shift);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp b/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
new file mode 100644
index 0000000..22c037f
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseConvolution3x3::CLDepthwiseConvolution3x3()
+ : _kernel(), _border_handler()
+{
+}
+
+void CLDepthwiseConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+ _kernel.configure(input, output, weights, conv_info);
+ _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void CLDepthwiseConvolution3x3::run()
+{
+ CLScheduler::get().enqueue(_border_handler);
+ CLScheduler::get().enqueue(_kernel);
+}
+
+CLDepthwiseConvolution::CLDepthwiseConvolution()
+ : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(),
+ _v2mm_output()
+{
+}
+
+void CLDepthwiseConvolution::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+
+ const size_t weights_w = weights->info()->dimension(0);
+ const size_t weights_h = weights->info()->dimension(1);
+ const size_t weights_z = weights->info()->dimension(2);
+
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+
+ // Set up intermediate tensors
+ const size_t patch_size = weights_w * weights_h;
+ const size_t conv_size = conv_w * conv_h;
+
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ TensorShape shape_v2mm_out = output->info()->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+
+ const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+ const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ _input_reshaped.allocator()->init(info_im2col);
+ _weights_reshaped.allocator()->init(info_weights_reshape);
+ _v2mm_output.allocator()->init(info_v2mm_out);
+
+ // Configure kernels
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info);
+ _weights_reshape_kernel.configure(weights, &_weights_reshaped);
+ _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+ _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+
+ BorderSize border_size = _v2mm_kernel.border_size();
+ _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+
+ border_size.bottom = 0;
+ _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+
+ // Allocate intermediate tensors
+ _input_reshaped.allocator()->allocate();
+ _weights_reshaped.allocator()->allocate();
+ _v2mm_output.allocator()->allocate();
+}
+
+void CLDepthwiseConvolution::run()
+{
+ CLScheduler::get().enqueue(_im2col_kernel);
+
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+ CLScheduler::get().enqueue(_v2mm_input_fill_border);
+ CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+ CLScheduler::get().enqueue(_v2mm_kernel);
+
+ CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+}
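
A minimal configure/run sketch for the new 3x3 depthwise path is shown below, assuming the CL runtime can be initialised via default_init() and using hypothetical F32 shapes (width x height x channels ordering, stride 1, padding 1 so the spatial size is preserved); input and weight data would be filled in via map/unmap before run().

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Hypothetical shapes: 32x32 input with 16 channels, one 3x3 filter per channel
    CLTensor input, weights, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

    // Configure before allocating, as elsewhere in the library
    CLDepthwiseConvolution3x3 depthwise;
    depthwise.configure(&input, &output, &weights, PadStrideInfo(1, 1, 1, 1));

    input.allocator()->allocate();
    weights.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input and weights (e.g. via map()/unmap()) ...

    depthwise.run();
    CLScheduler::get().sync();
    return 0;
}
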
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
new file mode 100644
index 0000000..c325b3e
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseSeparableConvolutionLayer::CLDepthwiseSeparableConvolutionLayer()
+ : _depthwise_conv(), _pointwise_conv()
+{
+}
+
+void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICLTensor *depthwise_weights, ICLTensor *depthwise_out, const ICLTensor *pointwise_weights, const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
+{
+ _depthwise_conv.configure(input, depthwise_out, depthwise_weights, depthwise_conv_info);
+ _pointwise_conv.configure(depthwise_out, pointwise_weights, biases, output, pointwise_conv_info);
+}
+
+void CLDepthwiseSeparableConvolutionLayer::run()
+{
+ _depthwise_conv.run();
+ _pointwise_conv.run();
+}
\ No newline at end of file
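
A usage sketch for the composite function, following the configure() signature above; the tensor shapes (a 3x3 depthwise stage followed by a 1x1 pointwise stage) are my own illustrative choices.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Depthwise 3x3 stage: 16x16x8 -> 14x14x8, then pointwise 1x1 stage: 14x14x8 -> 14x14x16
    CLTensor src, dw_weights, dw_out, pw_weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    dw_weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32));
    dw_out.allocator()->init(TensorInfo(TensorShape(14U, 14U, 8U), 1, DataType::F32));
    pw_weights.allocator()->init(TensorInfo(TensorShape(1U, 1U, 8U, 16U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(14U, 14U, 16U), 1, DataType::F32));

    CLDepthwiseSeparableConvolutionLayer dsconv;
    dsconv.configure(&src, &dw_weights, &dw_out, &pw_weights, &biases, &dst,
                     PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0));

    for(auto *t : { &src, &dw_weights, &dw_out, &pw_weights, &biases, &dst })
    {
        t->allocator()->allocate();
    }
    // ... fill src, dw_weights, pw_weights and biases ...

    dsconv.run();
    CLScheduler::get().sync();
    return 0;
}
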
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
new file mode 100644
index 0000000..5559d42
--- /dev/null
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDequantizationLayer::CLDequantizationLayer()
+ : _dequantize_kernel()
+{
+}
+
+void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
+{
+ _dequantize_kernel.configure(input, output, min_max);
+}
+
+void CLDequantizationLayer::run()
+{
+ // Run dequantization kernel
+ CLScheduler::get().enqueue(_dequantize_kernel, false);
+}
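
The function is a thin wrapper around its kernel, so the interesting part is the tensor contract: a quantized U8 input, an F32 output of the same shape, and a small F32 tensor holding the min/max pair used for the linear mapping. The (2, 1) min_max shape below is an assumption; in practice the tensor normally comes from the companion min/max computation.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor q_in, f_out, min_max;
    q_in.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::U8));
    f_out.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
    min_max.allocator()->init(TensorInfo(TensorShape(2U, 1U), 1, DataType::F32)); // {min, max} per batch (assumed layout)

    CLDequantizationLayer dequant;
    dequant.configure(&q_in, &f_out, &min_max);

    q_in.allocator()->allocate();
    f_out.allocator()->allocate();
    min_max.allocator()->allocate();
    // ... fill q_in and min_max ...

    dequant.run();
    CLScheduler::get().sync();
    return 0;
}
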
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
index c51cb4c..ae49996 100644
--- a/src/runtime/CL/functions/CLDerivative.cpp
+++ b/src/runtime/CL/functions/CLDerivative.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLDerivative.h"
#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLDerivativeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLDerivativeKernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
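
The public configure() signature in this hunk is unchanged, so callers are unaffected; for reference, a minimal invocation with a U8 source and S16 gradients looks like this.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDerivative.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, grad_x, grad_y;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
    grad_x.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::S16));
    grad_y.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::S16));

    CLDerivative derivative;
    derivative.configure(&src, &grad_x, &grad_y, BorderMode::UNDEFINED, 0);

    src.allocator()->allocate();
    grad_x.allocator()->allocate();
    grad_y.allocator()->allocate();
    // ... fill src ...

    derivative.run();
    CLScheduler::get().sync();
    return 0;
}
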
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
index 345f477..59c5ea5 100644
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ b/src/runtime/CL/functions/CLDilate.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLDilate.h"
#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLDilateKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLDilateKernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
new file mode 100644
index 0000000..6fafd9c
--- /dev/null
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDirectConvolutionLayer::CLDirectConvolutionLayer()
+ : _direct_conv_kernel(), _input_border_handler()
+{
+}
+
+void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ // Set GPU target
+ _direct_conv_kernel.set_target(CLScheduler::get().target());
+
+ // Configure direct convolution
+ _direct_conv_kernel.configure(input, weights, biases, output, conv_info);
+
+ // Configure border handler
+ _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void CLDirectConvolutionLayer::run()
+{
+ // Run border handler
+ CLScheduler::get().enqueue(_input_border_handler, false);
+
+ // Run direct convolution
+ CLScheduler::get().enqueue(_direct_conv_kernel);
+}
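
A usage sketch for the new direct convolution, assuming the usual width x height x input-channels x output-channels weights layout and a 3x3 kernel with no padding; the shapes are illustrative.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // 8x8x3 input, 16 filters of 3x3x3, stride 1, no padding -> 6x6x16 output
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(6U, 6U, 16U), 1, DataType::F32));

    CLDirectConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0));

    for(auto *t : { &src, &weights, &biases, &dst })
    {
        t->allocator()->allocate();
    }
    // ... fill src, weights and biases ...

    conv.run();
    CLScheduler::get().sync();
    return 0;
}
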
diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp
index b4c50e4..eb1f6e4 100644
--- a/src/runtime/CL/functions/CLErode.cpp
+++ b/src/runtime/CL/functions/CLErode.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLErode.h"
#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLErodeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLErodeKernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index d2903fb..7a0dd09 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -36,8 +36,9 @@
using namespace arm_compute;
-CLFastCorners::CLFastCorners()
- : _fast_corners_kernel(),
+CLFastCorners::CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)),
+ _fast_corners_kernel(),
_suppr_func(),
_copy_array_kernel(),
_output(),
@@ -70,6 +71,7 @@
const bool update_number = (nullptr != _num_corners);
+ _memory_group.manage(&_output);
_fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, border_mode);
if(!_non_max)
@@ -79,6 +81,7 @@
else
{
_suppr.allocator()->init(tensor_info);
+ _memory_group.manage(&_suppr);
_suppr_func.configure(&_output, &_suppr, border_mode);
_copy_array_kernel.configure(&_suppr, update_number, corners, &_num_buffer);
@@ -94,6 +97,8 @@
{
cl::CommandQueue q = CLScheduler::get().queue();
+ _memory_group.acquire();
+
if(_non_max)
{
ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function");
@@ -124,4 +129,6 @@
}
q.flush();
+
+ _memory_group.release();
}
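
The acquire()/release() pair added above only pays off when several functions share one IMemoryManager, so that their intermediate buffers can draw on the same pool. Below is a sketch of how such a manager might be wired up; the BlobLifetimeManager/PoolManager/MemoryManagerOnDemand combination and the set_allocator/set_num_pools/finalize sequence reflect the 17.09-era interface as I understand it, so treat them as assumptions.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/CL/CLBufferAllocator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // One lifetime manager and one pool manager backing a single on-demand memory manager
    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // Functions built with the same manager can reuse each other's intermediate buffers,
    // because their memory groups only hold the memory between acquire() and release()
    CLGaussian5x5 gauss_a(mm);
    CLGaussian5x5 gauss_b(mm);

    CLTensor src, mid, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
    mid.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));

    gauss_a.configure(&src, &mid, BorderMode::REPLICATE);
    gauss_b.configure(&mid, &dst, BorderMode::REPLICATE);

    src.allocator()->allocate();
    mid.allocator()->allocate();
    dst.allocator()->allocate();

    // Finalize the manager once every function has been configured
    // (set_allocator/set_num_pools/finalize is assumed here to be the 17.09 interface)
    CLBufferAllocator allocator;
    mm->set_allocator(&allocator);
    mm->set_num_pools(1);
    mm->finalize();

    gauss_a.run();
    gauss_b.run();
    CLScheduler::get().sync();
    return 0;
}
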
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
index 9e59b77..54c096e 100644
--- a/src/runtime/CL/functions/CLFillBorder.cpp
+++ b/src/runtime/CL/functions/CLFillBorder.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLFillBorderKernel>();
- k->configure(tensor, border_width, border_mode, constant_border_value);
+ auto k = arm_compute::support::cpp14::make_unique<CLFillBorderKernel>();
+ k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
new file mode 100644
index 0000000..9f571b2
--- /dev/null
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/Size2D.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLIm2ColKernel>();
+ k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+ _kernel = std::move(k);
+}
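
Since the layer is just an Im2Col pass with a 1x1 patch, the only contract worth spelling out is the shape one: for a single batch the output width must be the product of the input's spatial and channel dimensions. A sketch under that assumption:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(48U), 1, DataType::F32)); // 4 * 4 * 3

    CLFlattenLayer flatten;
    flatten.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...

    flatten.run();
    CLScheduler::get().sync();
    return 0;
}
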
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
new file mode 100644
index 0000000..364db34
--- /dev/null
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFloor.h"
+
+#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLFloor::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLFloorKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
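
Nothing special to drive here; an element-wise floor over an F32 tensor with an output of matching shape:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFloor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));

    CLFloor floor_fn;
    floor_fn.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...

    floor_fn.run();
    CLScheduler::get().sync();
    return 0;
}
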
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 57d57d5..ee1558f 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -23,88 +23,31 @@
*/
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
#include <algorithm>
-#include <cmath>
using namespace arm_compute;
-CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
- : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
+ _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
{
}
-void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
+void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
- ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- _transpose_weights = transpose_weights;
- _is_batched_fc_layer = is_batched_fc_layer;
-
- // Check if we need to transpose the weights
- if(_transpose_weights)
- {
- if(_is_batched_fc_layer)
- {
- // Initialize the output tensor for transpose
- TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
- _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
- _transpose_kernel.configure(input, &_transpose_output);
-
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(&_transpose_output, output);
-
- // Allocate temporary tensor used for transposing the weights
- _transpose_output.allocator()->allocate();
- }
- else
- {
- _transpose_kernel.configure(input, output);
- }
- }
- else
- {
- if(_is_batched_fc_layer)
- {
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(input, output);
- }
- else
- {
- ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
- }
- }
-}
-
-void CLFullyConnectedLayerReshapeWeights::run()
-{
- if(_transpose_weights)
- {
- CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
- }
- if(_is_batched_fc_layer)
- {
- CLScheduler::get().enqueue(_transpose1xW_kernel);
- }
-}
-
-CLFullyConnectedLayer::CLFullyConnectedLayer()
- : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
- _are_weights_reshaped(true), _is_fc_after_conv(true), _is_batched_fc_layer(false), _accumulate_biases(false)
-{
-}
-
-void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+ ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
const DataType dt = input->info()->data_type();
const int fixed_point_position = input->info()->fixed_point_position();
@@ -119,93 +62,33 @@
shape_im2col.set(3, input->info()->dimension(5));
_im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
- // Initialize output tensor for interleave 4x4
- TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
// Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure interleave4x4 kernel
- _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
+ _memory_group.manage(&_im2col_output);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
// Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
- // Allocate the tensors once all the configure methods have been called
- _im2col_output.allocator()->allocate();
- _interleave4x4_output.allocator()->allocate();
-}
-
-void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // Initialize output tensor for interleave 4x4
- TensorShape shape_interleaved = input->info()->tensor_shape();
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
- // Configure interleave4x4 kernel
- _interleave4x4_kernel.configure(input, &_interleave4x4_output);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
- // Allocate the tensors once all the configure methods have been called
- _interleave4x4_output.allocator()->allocate();
-}
-
-void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for im2col
- TensorShape shape_im2col;
- shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
- shape_im2col.set(1, 1);
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
-
- // Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
+ _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
// Allocate the output tensor for im2col once all the configure methods have been called
_im2col_output.allocator()->allocate();
}
-void CLFullyConnectedLayer::configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
// Configure matrix multiply kernel
- _mm_kernel.configure(input, weights, output, 1.0f);
+ _mm_kernel.configure(input, weights, output, 1.0f, false);
}
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- _are_weights_reshaped = are_weights_reshaped;
+ _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
_is_fc_after_conv = true;
- _is_batched_fc_layer = false;
_accumulate_biases = false;
if(biases != nullptr)
@@ -224,90 +107,46 @@
// 3) Convolution layer -> Fully Connected layer with batches
// 4) Fully Connected layer -> Fully Connected layer with batches
- // Check if we have a fully connected layer with batches
- _is_batched_fc_layer = (output->info()->dimension(1) > 1);
-
const ICLTensor *weights_to_use = weights;
- if(!are_weights_reshaped)
+ if(!_are_weights_reshaped)
{
- if((transpose_weights || _is_batched_fc_layer))
- {
- weights_to_use = &_reshape_weights_output;
+ weights_to_use = &_reshape_weights_output;
- if(transpose_weights)
- {
- if(_is_batched_fc_layer)
- {
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
- else
- {
- TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
-
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
-
- // Reshape the weights
- _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
- }
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
}
- if(_is_batched_fc_layer)
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+
+ if(is_batched_fc_layer)
{
_is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
input->info()->tensor_shape().cend(),
output->info()->tensor_shape().cbegin() + 1));
-
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer with batches
- configure_conv_fc_wb(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer with batches
- configure_fc_fc_wb(input, weights_to_use, output);
- }
}
else
{
- // In case of not batched fully connected layer, the weights will not be reshaped using transposed1xW
- _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+ _is_fc_after_conv = input->info()->num_dimensions() > 1;
+ }
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc_nb(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc_nb(input, weights_to_use, output);
- }
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(input, weights_to_use, output);
}
// Allocate the reshaped weights tensor if the weights were not already reshaped, once all the configure methods have been called
- if(!are_weights_reshaped)
+ if(!_are_weights_reshaped)
{
- if(transpose_weights || _is_batched_fc_layer)
- {
- // Allocate the tensor for the weights reshaped
- _reshape_weights_output.allocator()->allocate();
- }
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
}
}
@@ -320,18 +159,14 @@
_reshape_weights_kernel.run();
}
+ _memory_group.acquire();
+
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
{
CLScheduler::get().enqueue(_im2col_kernel, false);
}
- // Interleave input
- if(_is_batched_fc_layer)
- {
- CLScheduler::get().enqueue(_interleave4x4_kernel, false);
- }
-
// Run matrix multiply
CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
@@ -340,4 +175,6 @@
{
CLScheduler::get().enqueue(_accumulate_biases_kernel);
}
+
+ _memory_group.release();
}
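
With the batched/non-batched split gone, a plain fully connected call reduces to the sketch below: judging from the checks in this diff, the weights are handed over untransposed (width = number of inputs, height = number of outputs) and reshaped on the first run. Shapes and flag values are illustrative.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // 128 inputs, 32 outputs
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));

    CLFullyConnectedLayer fc;
    fc.configure(&src, &weights, &biases, &dst, true /* transpose_weights */, false /* are_weights_reshaped */);

    for(auto *t : { &src, &weights, &biases, &dst })
    {
        t->allocator()->allocate();
    }
    // ... fill src, weights and biases ...

    fc.run();
    CLScheduler::get().sync();
    return 0;
}
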
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 7408054..a81d113 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,20 +38,18 @@
using namespace arm_compute;
-CLGEMM::CLGEMM()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
{
}
void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
if(c != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
@@ -59,13 +57,18 @@
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the output matrix");
}
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
- if(a->info()->dimension(1) != 1)
+ // If matrix A has 16 rows or fewer, we run a special version of GEMM without reshaping the input tensors
+ _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
+ const ICLTensor *matrix_a = a;
+ const ICLTensor *matrix_b = b;
+
+ if(_is_interleaved_transposed)
{
- _run_vector_matrix_multiplication = false;
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
TensorShape shape_tmp_a = a->info()->tensor_shape();
TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -73,27 +76,20 @@
shape_tmp_a.set(0, a->info()->dimension(0) * 4);
shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
- if(DataType::F32 == a->info()->data_type())
- {
- shape_tmp_b.set(0, b->info()->dimension(1) * 4);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
- }
- else if(DataType::F16 == a->info()->data_type())
- {
- shape_tmp_b.set(0, b->info()->dimension(1) * 8);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
- }
- else
- {
- ARM_COMPUTE_ERROR("DataType not supported");
- }
+ const unsigned int transpose_w = max_cl_vector_width / data_size_from_type(b->info()->data_type());
+ shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
_tmp_a.allocator()->init(info_a);
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
_tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
// Configure interleave kernel
_interleave_kernel.configure(a, &_tmp_a);
@@ -101,19 +97,17 @@
_transpose_kernel.configure(b, &_tmp_b);
// Configure matrix multiply kernel
- _mm_kernel.configure(&_tmp_a, &_tmp_b, output, alpha);
+ _mm_kernel.set_target(CLScheduler::get().target());
+ }
+ _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+
+ if(_is_interleaved_transposed)
+ {
// Allocate intermediate tensors
_tmp_a.allocator()->allocate();
_tmp_b.allocator()->allocate();
}
- else // The first input tensor is a vector
- {
- _run_vector_matrix_multiplication = true;
-
- // Configure the matrix multiply kernel
- _mm_kernel.configure(a, b, output, alpha);
- }
// Configure matrix addition kernel
if(beta != 0 && c != nullptr)
@@ -125,7 +119,9 @@
void CLGEMM::run()
{
- if(!_run_vector_matrix_multiplication)
+ _memory_group.acquire();
+
+ if(_is_interleaved_transposed)
{
// Run interleave kernel
CLScheduler::get().enqueue(_interleave_kernel, false);
@@ -142,4 +138,6 @@
{
CLScheduler::get().enqueue(_ma_kernel);
}
+
+ _memory_group.release();
}
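
The reshape decision now depends only on the height of A, so small matrices go straight to the matrix-multiply kernel. A basic F32 call is sketched below; a shared IMemoryManager can optionally be passed to the constructor so that _tmp_a/_tmp_b take part in the pooling sketched after the CLFastCorners hunk above.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // dst = alpha * A * B : A is 4x32 (M=4 rows, K=32), B is 32x16 (K=32 rows, N=16) -> dst is 4x16
    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(32U, 4U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

    CLGEMM gemm;
    gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f); // no C matrix, so beta is ignored

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill a and b ...

    gemm.run(); // M = 4 <= 16, so the interleave/transpose kernels are skipped
    CLScheduler::get().sync();
    return 0;
}
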
diff --git a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
index 9dc7715..45547e4 100644
--- a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
+++ b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
@@ -24,13 +24,13 @@
#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void CLGEMMInterleave4x4::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
index 45e011d..db6d11c 100644
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowp.cpp
@@ -33,8 +33,8 @@
using namespace arm_compute;
-CLGEMMLowp::CLGEMMLowp()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
{
}
@@ -62,6 +62,10 @@
TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
_tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
// Configure kernels
_interleave_kernel.configure(a, &_tmp_a);
_transpose_kernel.configure(b, &_tmp_b);
@@ -74,6 +78,8 @@
void CLGEMMLowp::run()
{
+ _memory_group.acquire();
+
/* Run interleave kernel */
CLScheduler::get().enqueue(_interleave_kernel, false);
@@ -82,4 +88,6 @@
/* Run matrix multiply kernel */
CLScheduler::get().enqueue(_mm_kernel, false);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp b/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
new file mode 100644
index 0000000..d054e01
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLGEMMTranspose1xW::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMTranspose1xWKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
index 362a3fe..7ebabd7 100644
--- a/src/runtime/CL/functions/CLGaussian3x3.cpp
+++ b/src/runtime/CL/functions/CLGaussian3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLGaussian3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLGaussian3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index e83a8fb..f30eee1 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -35,8 +35,8 @@
using namespace arm_compute;
-CLGaussian5x5::CLGaussian5x5()
- : _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
+CLGaussian5x5::CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
{
}
@@ -46,6 +46,10 @@
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16));
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
+ // Configure kernels
_kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
_kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
_border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
@@ -57,6 +61,11 @@
void CLGaussian5x5::run()
{
CLScheduler::get().enqueue(_border_handler, false);
+
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 8a4279e..8436dce 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -27,11 +27,11 @@
#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
#include "arm_compute/runtime/CL/CLPyramid.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -48,8 +48,10 @@
{
}
-CLGaussianPyramidHalf::CLGaussianPyramidHalf()
- : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
+ : _border_handler(),
+ _horizontal_reduction(),
+ _vertical_reduction()
{
}
@@ -70,9 +72,9 @@
if(num_levels > 1)
{
- _border_handler = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+ _border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -119,8 +121,9 @@
}
}
-CLGaussianPyramidOrb::CLGaussianPyramidOrb()
- : _gauss5x5(), _scale_nearest()
+CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT
+ : _gauss5x5(),
+ _scale_nearest()
{
}
@@ -141,8 +144,8 @@
if(num_levels > 1)
{
- _gauss5x5 = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
- _scale_nearest = arm_compute::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
+ _gauss5x5 = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
+ _scale_nearest = arm_compute::support::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index b1b5a03..1470d5c 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -31,8 +31,8 @@
using namespace arm_compute;
-CLHOGDescriptor::CLHOGDescriptor()
- : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
{
}
@@ -71,9 +71,16 @@
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space.allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+ // Manage intermediate buffers
+ _memory_group.manage(&_hog_space);
+
// Initialise orientation binning kernel
_orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
@@ -88,6 +95,8 @@
void CLHOGDescriptor::run()
{
+ _memory_group.acquire();
+
// Run gradient
_gradient.run();
@@ -96,4 +105,6 @@
// Run block normalization
CLScheduler::get().enqueue(_block_norm);
+
+ _memory_group.release();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 2387474..51aeaed 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -29,8 +29,8 @@
using namespace arm_compute;
-CLHOGGradient::CLHOGGradient()
- : _derivative(), _mag_phase(), _gx(), _gy()
+CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy()
{
}
@@ -47,6 +47,10 @@
_gx.allocator()->init(info);
_gy.allocator()->init(info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Initialise derivative kernel
_derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
@@ -67,9 +71,13 @@
void CLHOGGradient::run()
{
+ _memory_group.acquire();
+
// Run derivative
_derivative.run();
// Run magnitude/phase kernel
CLScheduler::get().enqueue(_mag_phase);
+
+ _memory_group.release();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index b8f2224..8012c2f 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -25,17 +25,31 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Scheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLHOGMultiDetection::CLHOGMultiDetection()
- : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
- _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _gradient_kernel(),
+ _orient_bin_kernel(),
+ _block_norm_kernel(),
+ _hog_detect_kernel(),
+ _non_maxima_kernel(),
+ _hog_space(),
+ _hog_norm_space(),
+ _detection_windows(),
+ _mag(),
+ _phase(),
+ _non_maxima_suppression(false),
+ _num_orient_bin_kernel(0),
+ _num_block_norm_kernel(0),
+ _num_hog_detect_kernel(0)
{
}
@@ -114,12 +128,12 @@
_num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
_num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
- _orient_bin_kernel = arm_compute::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
- _block_norm_kernel = arm_compute::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
- _hog_detect_kernel = arm_compute::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
- _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
- _hog_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
- _hog_norm_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+ _orient_bin_kernel = arm_compute::support::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::support::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::support::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
// Allocate tensors for magnitude and phase
TensorInfo info_mag(shape_img, Format::S16);
@@ -128,6 +142,10 @@
TensorInfo info_phase(shape_img, Format::U8);
_phase.allocator()->init(info_phase);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
@@ -153,10 +171,17 @@
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space[i].allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_space.get() + i);
+
// Initialise orientation binning kernel
_orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
// Configure CLTensor for the normalized HOG space and block normalization kernel
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
@@ -167,10 +192,19 @@
TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
_hog_norm_space[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_norm_space.get() + i);
+
// Initialize block normalization kernel
_block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
detection_window_strides->map(CLScheduler::get().queue(), true);
// Configure HOG detector kernel
@@ -187,14 +221,6 @@
_non_maxima_kernel->configure(_detection_windows, min_distance);
// Allocate intermediate tensors
- _mag.allocator()->allocate();
- _phase.allocator()->allocate();
-
- for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
- {
- _hog_space[i].allocator()->allocate();
- }
-
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
_hog_norm_space[i].allocator()->allocate();
@@ -205,6 +231,8 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Reset detection window
_detection_windows->clear();
@@ -234,7 +262,9 @@
{
// Map detection windows array before computing non maxima suppression
_detection_windows->map(CLScheduler::get().queue(), true);
- _non_maxima_kernel->run(_non_maxima_kernel->window());
+ Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
_detection_windows->unmap(CLScheduler::get().queue());
}
-}
\ No newline at end of file
+
+ _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 2db277f..059528f 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -27,7 +27,6 @@
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -36,14 +35,28 @@
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
#include "arm_compute/runtime/ITensorAllocator.h"
#include "arm_compute/runtime/Scheduler.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
#include <utility>
using namespace arm_compute;
-CLHarrisCorners::CLHarrisCorners()
- : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0),
+CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _sobel(nullptr),
+ _harris_score(),
+ _non_max_suppr(),
+ _candidates(),
+ _sort_euclidean(),
+ _border_gx(),
+ _border_gy(),
+ _gx(),
+ _gy(),
+ _score(),
+ _nonmax(),
+ _corners_list(nullptr),
+ _num_corner_candidates(0),
_corners(nullptr)
{
}
@@ -62,6 +75,7 @@
const TensorShape shape = input->info()->tensor_shape();
const DataType dt = (gradient_size < 7) ? DataType::S16 : DataType::S32;
TensorInfo tensor_info(shape, 1, dt);
+
_gx.allocator()->init(tensor_info);
_gy.allocator()->init(tensor_info);
@@ -69,28 +83,32 @@
_score.allocator()->init(info_f32);
_nonmax.allocator()->init(info_f32);
- _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+ _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
/* Set/init Sobel kernel accordingly with gradient_size */
switch(gradient_size)
{
case 3:
{
- auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 5:
{
- auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 7:
{
- auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
@@ -99,37 +117,49 @@
ARM_COMPUTE_ERROR("Gradient size not implemented");
}
- // Configure border filling before harris score
- _border_gx.configure(&_gx, block_size / 2, border_mode, constant_border_value);
- _border_gy.configure(&_gy, block_size / 2, border_mode, constant_border_value);
-
// Normalization factor
const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
const float pow4_normalization_factor = pow(norm_factor, 4);
+ // Manage intermediate buffers
+ _memory_group.manage(&_score);
+
// Set/init Harris Score kernel accordingly with block_size
_harris_score.configure(&_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- // Init non-maxima suppression function
- _non_max_suppr.configure(&_score, &_nonmax, border_mode == BorderMode::UNDEFINED);
-
- // Init corner candidates kernel
- _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
-
- // Init euclidean distance
- _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
+ // Configure border filling using the harris score kernel's border size
+ _border_gx.configure(&_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_gy.configure(&_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
// Allocate intermediate buffers
_gx.allocator()->allocate();
_gy.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_nonmax);
+
+ // Init non-maxima suppression function
+ _non_max_suppr.configure(&_score, &_nonmax, border_mode);
+
+ // Allocate intermediate buffers
_score.allocator()->allocate();
+
+ // Init corner candidates kernel
+ _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+
+ // Allocate intermediate buffers
_nonmax.allocator()->allocate();
+
+ // Init euclidean distance
+ _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
}
void CLHarrisCorners::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Reset the number of corner candidates to 0
_num_corner_candidates = 0;
@@ -144,7 +174,7 @@
CLScheduler::get().enqueue(_harris_score, false);
// Run non-maxima suppression
- CLScheduler::get().enqueue(_non_max_suppr);
+ _non_max_suppr.run();
// Run corner candidate kernel
_nonmax.map(true);
@@ -152,6 +182,8 @@
_nonmax.unmap();
_corners->map(CLScheduler::get().queue(), true);
- _sort_euclidean.run(_sort_euclidean.window());
+ Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
_corners->unmap(CLScheduler::get().queue());
+
+ _memory_group.release();
}
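
Callers are untouched by the internal reordering; for reference, a typical invocation is sketched below. The configure() parameter order (threshold, min_dist, sensitivity, gradient_size, block_size, corners, border mode) is quoted from the public header from memory, so treat it as an assumption.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

    CLKeyPointArray corners(1000); // room for up to 1000 detected corners

    CLHarrisCorners harris;
    harris.configure(&src, 1e-5f /* threshold */, 5.0f /* min_dist */, 0.04f /* sensitivity */,
                     3 /* gradient_size */, 3 /* block_size */, &corners, BorderMode::UNDEFINED);

    src.allocator()->allocate();
    // ... fill src ...

    harris.run();
    CLScheduler::get().sync();
    return 0;
}
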
diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2Normalize.cpp
new file mode 100644
index 0000000..99be8ca
--- /dev/null
+++ b/src/runtime/CL/functions/CLL2Normalize.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLL2Normalize.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLL2Normalize::CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
+{
+}
+
+void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
+{
+ // Manage intermediate buffers
+ _memory_group.manage(&_sumsq);
+
+ // Configure kernels
+ _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+ _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+
+ // Allocate intermediate tensor
+ _sumsq.allocator()->allocate();
+}
+
+void CLL2Normalize::run()
+{
+ _memory_group.acquire();
+
+ _reduce_func.run();
+ CLScheduler::get().enqueue(_normalize_kernel, true);
+
+ _memory_group.release();
+}
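For reference, the math this new function composes on the device is a sum-of-squares reduction along `axis` followed by an element-wise scale. The standalone sketch below assumes the usual epsilon-guarded definition y_i = x_i / sqrt(max(sum_j x_j^2, epsilon)); the exact placement of epsilon inside the CL kernel is not shown in this diff, so treat the formula as illustrative rather than the kernel code.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

std::vector<float> l2_normalize(const std::vector<float> &x, float epsilon = 1e-12f)
{
    // Stage 1: reduction (the role of _reduce_func writing into _sumsq)
    float sumsq = 0.f;
    for(float v : x)
    {
        sumsq += v * v;
    }

    // Stage 2: normalization (the role of _normalize_kernel)
    const float        inv_norm = 1.f / std::sqrt(std::max(sumsq, epsilon));
    std::vector<float> y(x.size());
    std::transform(x.begin(), x.end(), y.begin(), [inv_norm](float v) { return v * inv_norm; });
    return y;
}

int main()
{
    for(float v : l2_normalize({ 3.f, 4.f }))
    {
        std::cout << v << ' '; // 0.6 0.8
    }
    std::cout << '\n';
}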
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index d7ce206..a395487 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IPyramid.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -33,11 +32,18 @@
#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLLaplacianPyramid::CLLaplacianPyramid()
- : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _depth_function(), _gauss_pyr(), _conv_pyr()
+CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT
+ : _num_levels(0),
+ _gaussian_pyr_function(),
+ _convf(),
+ _subf(),
+ _depth_function(),
+ _gauss_pyr(),
+ _conv_pyr()
{
}
@@ -64,8 +70,8 @@
// Create Gaussian Pyramid function
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
- _convf = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
- _subf = arm_compute::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
+ _convf = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
+ _subf = arm_compute::support::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
for(unsigned int i = 0; i < _num_levels; ++i)
{
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index 1dfab74..678848b 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -24,18 +24,21 @@
#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IPyramid.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
using namespace arm_compute;
-CLLaplacianReconstruct::CLLaplacianReconstruct()
- : _tmp_pyr(), _addf(), _scalef(), _depthf()
+CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT
+ : _tmp_pyr(),
+ _addf(),
+ _scalef(),
+ _depthf()
{
}
@@ -60,8 +63,8 @@
_tmp_pyr.init(pyramid_info);
// Allocate add and scale functions. Level 0 does not need to be scaled.
- _addf = arm_compute::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
- _scalef = arm_compute::cpp14::make_unique<CLScale[]>(num_levels - 1);
+ _addf = arm_compute::support::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
+ _scalef = arm_compute::support::cpp14::make_unique<CLScale[]>(num_levels - 1);
const size_t last_level = num_levels - 1;
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 263fb51..a89a45a 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
using namespace arm_compute;
-CLLocallyConnectedLayer::CLLocallyConnectedLayer()
- : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+ _is_first_run(false)
{
}
@@ -68,8 +69,8 @@
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
- stride_x, stride_y, pad_x, pad_y, conv_info.round());
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ conv_info);
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
@@ -99,8 +100,12 @@
shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+ // Manage intermediate buffers
+ _memory_group.manage(&_input_im2col_reshaped);
+ _memory_group.manage(&_gemm_output);
+
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -120,6 +125,8 @@
CLScheduler::get().enqueue(_weights_reshape_kernel);
}
+ _memory_group.acquire();
+
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
@@ -128,4 +135,6 @@
// Reshape output matrix
CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+ _memory_group.release();
}
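The updated call above passes both weight dimensions and the whole PadStrideInfo to scaled_dimensions() instead of unpacked stride/pad values. As a reminder of the arithmetic behind the conv_w/conv_h checks, the sketch below re-derives the output size with the standard convolution formula under floor rounding; scaled_dims is a hypothetical helper, and the library's scaled_dimensions also supports ceil rounding, which is not modeled here.

#include <iostream>
#include <utility>

std::pair<unsigned int, unsigned int> scaled_dims(unsigned int w, unsigned int h,
                                                  unsigned int kernel_w, unsigned int kernel_h,
                                                  unsigned int stride_x, unsigned int stride_y,
                                                  unsigned int pad_x, unsigned int pad_y)
{
    // Standard output-size formula with floor rounding
    const unsigned int out_w = (w + 2 * pad_x - kernel_w) / stride_x + 1;
    const unsigned int out_h = (h + 2 * pad_y - kernel_h) / stride_y + 1;
    return { out_w, out_h };
}

int main()
{
    // e.g. a 32x32 input, 5x5 weights, stride 1, no padding -> 28x28 output,
    // so weights dimension 4 must hold 28 * 28 locally connected positions.
    std::pair<unsigned int, unsigned int> dims = scaled_dims(32, 32, 5, 5, 1, 1, 0, 0);
    std::cout << dims.first << "x" << dims.second << '\n'; // 28x28
}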
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index 51088cb..68b8c35 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
{
- auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
k->configure(input1, input2, output, nullptr, mag_type);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 56ba146..838f7e7 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -23,19 +23,19 @@
*/
#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
-#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
CLMeanStdDev::CLMeanStdDev()
: _mean_stddev_kernel(),
+ _fill_border_kernel(),
_global_sum(),
_global_sum_squared()
{
}
-void CLMeanStdDev::configure(const ICLImage *input, float *mean, float *stddev)
+void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
{
_global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
@@ -45,9 +45,11 @@
}
_mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+ _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
}
void CLMeanStdDev::run()
{
+ CLScheduler::get().enqueue(_fill_border_kernel);
CLScheduler::get().enqueue(_mean_stddev_kernel);
}
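CLMeanStdDev keeps two device-side accumulators (_global_sum and _global_sum_squared) and now fills a constant border before the kernel runs. The host-side math such accumulators enable is sketched below, assuming the usual population formulation mean = S/N and stddev = sqrt(S2/N - mean^2); this is illustrative C++, not the kernel.

#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

void mean_stddev(const std::vector<uint8_t> &pixels, float &mean, float &stddev)
{
    // The two running accumulators the kernel maintains on the device
    uint64_t sum    = 0;
    uint64_t sum_sq = 0;
    for(uint8_t p : pixels)
    {
        sum += p;
        sum_sq += static_cast<uint64_t>(p) * p;
    }

    const float n = static_cast<float>(pixels.size());
    mean          = sum / n;
    stddev        = std::sqrt(sum_sq / n - mean * mean);
}

int main()
{
    float mean = 0.f, stddev = 0.f;
    mean_stddev({ 10, 20, 30, 40 }, mean, stddev);
    std::cout << mean << " " << stddev << '\n'; // 25 ~11.18
}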
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
index 0c10f9a..55f9eaa 100644
--- a/src/runtime/CL/functions/CLMedian3x3.cpp
+++ b/src/runtime/CL/functions/CLMedian3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLMedian3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLMedian3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
index ad783d8..49dcbcb 100644
--- a/src/runtime/CL/functions/CLMinMaxLocation.cpp
+++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp
@@ -25,8 +25,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLMinMaxLocation::CLMinMaxLocation()
: _min_max_kernel(),
_min_max_loc_kernel(),
@@ -41,7 +41,7 @@
{
}
-void CLMinMaxLocation::configure(const ICLImage *input, int32_t *min, int32_t *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
{
ARM_COMPUTE_ERROR_ON(nullptr == min);
ARM_COMPUTE_ERROR_ON(nullptr == max);
@@ -67,8 +67,8 @@
CLScheduler::get().enqueue(_min_max_loc_kernel, false);
// Update min and max
- q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), _min);
- q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), _max);
+ q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_min));
+ q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_max));
// Update min and max count
if(_min_count != nullptr)
@@ -96,3 +96,4 @@
_max_loc->resize(max_corner_size);
}
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
index b593a6c..d37412f 100644
--- a/src/runtime/CL/functions/CLNonLinearFilter.cpp
+++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLNonLinearFilterKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLNonLinearFilterKernel>();
k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
index ca7d5ae..c0a0cef 100644
--- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
+++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode)
{
- auto k = arm_compute::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 2d89ebd..f4bd494 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -33,28 +33,26 @@
using namespace arm_compute;
CLNormalizationLayer::CLNormalizationLayer()
- : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
+ : _norm_kernel(), _border_handler()
{
}
-void CLNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON(input == nullptr);
- _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
+ // Configure normalization kernel
+ _norm_kernel.configure(input, output, norm_info);
- _norm_kernel.configure(input, &_squared_input, output, norm_info);
- _multiply_kernel.configure(input, input, &_squared_input, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
-
- // Allocate intermediate buffers
- _squared_input.allocator()->allocate();
+ _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
}
void CLNormalizationLayer::run()
{
- CLScheduler::get().enqueue(_multiply_kernel, false);
+ // Run border handler
CLScheduler::get().enqueue(_border_handler, false);
- CLScheduler::get().enqueue(_norm_kernel, false);
+
+ // Run normalization kernel
+ CLScheduler::get().enqueue(_norm_kernel);
}
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index a6b0eb3..d00b1b5 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -26,7 +26,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/CL/CLPyramid.h"
@@ -34,12 +33,27 @@
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLOpticalFlow::CLOpticalFlow()
- : _tracker_init_kernel(), _tracker_stage0_kernel(), _tracker_stage1_kernel(), _tracker_finalize_kernel(), _func_scharr(), _scharr_gx(), _scharr_gy(), _old_points(nullptr),
- _new_points_estimates(nullptr), _new_points(nullptr), _old_points_internal(), _new_points_internal(), _coefficient_table(), _old_values(), _num_levels(0)
+CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _tracker_init_kernel(),
+ _tracker_stage0_kernel(),
+ _tracker_stage1_kernel(),
+ _tracker_finalize_kernel(),
+ _func_scharr(),
+ _scharr_gx(),
+ _scharr_gy(),
+ _old_points(nullptr),
+ _new_points_estimates(nullptr),
+ _new_points(nullptr),
+ _old_points_internal(),
+ _new_points_internal(),
+ _coefficient_table(),
+ _old_values(),
+ _num_levels(0)
{
}
@@ -70,21 +84,21 @@
const int old_values_list_length = list_length * window_dimension * window_dimension;
// Create kernels and tensors
- _tracker_init_kernel = arm_compute::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
- _tracker_stage0_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
- _tracker_stage1_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
- _func_scharr = arm_compute::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
- _scharr_gx = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
- _scharr_gy = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
+ _tracker_init_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
+ _tracker_stage0_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
+ _tracker_stage1_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
+ _func_scharr = arm_compute::support::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
+ _scharr_gx = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
+ _scharr_gy = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
// Create internal keypoint arrays
- _old_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+ _old_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
_old_points_internal->resize(list_length);
- _new_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+ _new_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
_new_points_internal->resize(list_length);
- _coefficient_table = arm_compute::cpp14::make_unique<CLCoefficientTableArray>(list_length);
+ _coefficient_table = arm_compute::support::cpp14::make_unique<CLCoefficientTableArray>(list_length);
_coefficient_table->resize(list_length);
- _old_values = arm_compute::cpp14::make_unique<CLOldValueArray>(old_values_list_length);
+ _old_values = arm_compute::support::cpp14::make_unique<CLOldValueArray>(old_values_list_length);
_old_values->resize(old_values_list_length);
_new_points->resize(list_length);
@@ -103,6 +117,10 @@
_scharr_gx[i].allocator()->init(tensor_info);
_scharr_gy[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_scharr_gx.get() + i);
+ _memory_group.manage(_scharr_gy.get() + i);
+
// Init Scharr kernel
_func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
@@ -131,6 +149,8 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
+ _memory_group.acquire();
+
for(unsigned int level = _num_levels; level > 0; --level)
{
// Run Scharr kernel
@@ -147,4 +167,6 @@
}
CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
index a8cb22b..cf3fa7e 100644
--- a/src/runtime/CL/functions/CLPhase.cpp
+++ b/src/runtime/CL/functions/CLPhase.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLPhase.h"
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
{
- auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
k->configure(input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 8a86c2e..139d466 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLPixelWiseMultiplication::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
- auto k = arm_compute::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 1ef70f4..2cb7d63 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -24,14 +24,14 @@
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
{
// Configure pooling kernel
- auto k = arm_compute::cpp14::make_unique<CLPoolingLayerKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>();
k->configure(input, output, pool_info);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
new file mode 100644
index 0000000..ed1f51c
--- /dev/null
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLQuantizationLayer::CLQuantizationLayer()
+ : _quantize_kernel(), _min_max_kernel(), _min_max()
+{
+}
+
+void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
+ _min_max_kernel.configure(input, &_min_max);
+
+ // Configure quantize kernel
+ _quantize_kernel.configure(input, output, &_min_max);
+
+ // Allocate min_max tensor
+ _min_max.allocator()->allocate();
+}
+
+void CLQuantizationLayer::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // Reset min and max
+ _min_max_kernel.reset(q);
+
+ // Run min-max kernel
+ CLScheduler::get().enqueue(_min_max_kernel, false);
+
+ // Run quantize kernel
+ CLScheduler::get().enqueue(_quantize_kernel, false);
+}
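The new CLQuantizationLayer is a two-pass pipeline: a min/max reduction into _min_max, then a quantization kernel driven by that result. The sketch below mirrors that structure on the host with an assumed 8-bit affine mapping q = round((x - min) / (max - min) * 255); the exact scaling and rounding performed by the CL kernel is not shown in this diff, so the formula is illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<uint8_t> quantize(const std::vector<float> &x)
{
    // Pass 1: the role of _min_max_kernel
    const auto  mm    = std::minmax_element(x.begin(), x.end());
    const float min_v = *mm.first;
    const float range = std::max(*mm.second - min_v, 1e-6f); // guard against constant input

    // Pass 2: the role of _quantize_kernel, using the min/max from pass 1
    std::vector<uint8_t> q(x.size());
    std::transform(x.begin(), x.end(), q.begin(),
                   [&](float v) { return static_cast<uint8_t>(std::lround((v - min_v) / range * 255.f)); });
    return q;
}

int main()
{
    for(int v : quantize({ -1.f, 0.f, 1.f }))
    {
        std::cout << v << ' '; // 0 128 255
    }
    std::cout << '\n';
}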
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
new file mode 100644
index 0000000..0f480ee
--- /dev/null
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+
+#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ // Configure ROI pooling kernel
+ auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
+ k->configure(input, rois, output, pool_info);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
new file mode 100644
index 0000000..d02afb4
--- /dev/null
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+{
+}
+
+void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+{
+ // Calculate number of WGs. 16 elements per thread, 8 threads per WG
+ unsigned int num_of_wg = ceil(input->info()->dimension(0) / 128.f);
+
+ // Calculate number of stages. First stage performs op and the rest reduction sum
+ // depending on the size of the input. Last stage should have only 1 WG.
+ _num_of_stages = num_of_wg / 128 + 2;
+
+ // Create temporary tensors
+ _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+
+ // Configure reduction operation kernels
+ _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
+ _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
+
+ TensorShape shape{ input->info()->tensor_shape() };
+ for(unsigned int i = 0; i < _num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+
+ // Apply ReductionOperation only on first kernel
+ _memory_group.manage(_sums_vector.get());
+ _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, op);
+ _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ // Apply ReductionOperation on intermediate stages
+ for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
+ {
+ _memory_group.manage(_sums_vector.get() + i);
+ _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
+ _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _sums_vector[i - 1].allocator()->allocate();
+ }
+
+ // Apply ReductionOperation on the last stage
+ const unsigned int last_stage = _num_of_stages - 1;
+ _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, ReductionOperation::SUM);
+ _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _sums_vector[last_stage - 1].allocator()->allocate();
+}
+
+void CLReductionOperation::run()
+{
+ _memory_group.acquire();
+
+ for(unsigned int i = 0; i < _num_of_stages; ++i)
+ {
+ CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+ CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+ }
+
+ _memory_group.release();
+}
\ No newline at end of file
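The stage count above follows from the comment's 16 elements/thread x 8 threads/WG budget: each stage shrinks the x-dimension by a factor of 128, num_of_wg = ceil(width / 128), and _num_of_stages = num_of_wg / 128 + 2 (the first stage applies `op`, the rest sum, and the last writes to `output`). A small standalone check of that arithmetic for a few input widths:

#include <cmath>
#include <iostream>

int main()
{
    for(unsigned int width : { 100u, 4096u, 32768u })
    {
        // Same formulas as CLReductionOperation::configure()
        const unsigned int num_of_wg     = static_cast<unsigned int>(std::ceil(width / 128.f));
        const unsigned int num_of_stages = num_of_wg / 128 + 2;

        std::cout << "width " << width << ": " << num_of_wg << " WGs, " << num_of_stages << " stages, intermediate widths:";

        // Widths of the _sums_vector intermediates built in the configure loop
        unsigned int w = width;
        for(unsigned int i = 0; i < num_of_stages - 1; ++i)
        {
            w = static_cast<unsigned int>(std::ceil(w / 128.f));
            std::cout << ' ' << w;
        }
        std::cout << '\n';
    }
    // width 100:   1 WGs, 2 stages, intermediate widths: 1
    // width 4096:  32 WGs, 2 stages, intermediate widths: 32
    // width 32768: 256 WGs, 4 stages, intermediate widths: 256 2 1
}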
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
index f6b1713..bc3fd4e 100644
--- a/src/runtime/CL/functions/CLRemap.cpp
+++ b/src/runtime/CL/functions/CLRemap.cpp
@@ -26,10 +26,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -43,7 +43,7 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
- auto k = arm_compute::cpp14::make_unique<CLRemapKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLRemapKernel>();
k->configure(input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
new file mode 100644
index 0000000..2ce83dc
--- /dev/null
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLReshapeLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 043f873..49b0275 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -26,19 +26,14 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
{
- ARM_COMPUTE_ERROR_ON(output == input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- auto k = arm_compute::cpp14::make_unique<CLScaleKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp
index c8bc465..73f8673 100644
--- a/src/runtime/CL/functions/CLScharr3x3.cpp
+++ b/src/runtime/CL/functions/CLScharr3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLScharr3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLScharr3x3Kernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp
index 6b74eba..e227e58 100644
--- a/src/runtime/CL/functions/CLSobel3x3.cpp
+++ b/src/runtime/CL/functions/CLSobel3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3Kernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
index 098b546..d4bc855 100644
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -33,8 +33,8 @@
using namespace arm_compute;
-CLSobel5x5::CLSobel5x5()
- : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
{
}
@@ -51,6 +51,8 @@
{
_tmp_x.allocator()->init(tensor_info);
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -59,6 +61,7 @@
else if(run_sobel_x)
{
_tmp_x.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
_sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -66,6 +69,7 @@
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
@@ -76,6 +80,11 @@
void CLSobel5x5::run()
{
CLScheduler::get().enqueue(_border_handler, false);
+
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_sobel_hor, false);
CLScheduler::get().enqueue(_sobel_vert);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
index db84fa9..6083090 100644
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -33,8 +33,8 @@
using namespace arm_compute;
-CLSobel7x7::CLSobel7x7()
- : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
{
}
@@ -51,6 +51,8 @@
{
_tmp_x.allocator()->init(tensor_info);
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -59,6 +61,7 @@
else if(run_sobel_x)
{
_tmp_x.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
_sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -66,6 +69,7 @@
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
@@ -76,6 +80,11 @@
void CLSobel7x7::run()
{
CLScheduler::get().enqueue(_border_handler, false);
+
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_sobel_hor, false);
CLScheduler::get().enqueue(_sobel_vert);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 2a78c58..7505a2c 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -25,29 +25,34 @@
#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
-CLSoftmaxLayer::CLSoftmaxLayer()
- : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
{
}
void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
// Create intermediate tensors shapes
- _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
TensorShape shape = input->info()->tensor_shape();
shape.set(0, 1);
- TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
+ TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
_max.allocator()->init(tensor_info_max_sum);
_sum.allocator()->init(tensor_info_max_sum);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+ _memory_group.manage(&_max);
+ _memory_group.manage(&_sum);
+
// Configure Kernels
_max_kernel.configure(input, &_max);
_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
@@ -61,7 +66,11 @@
void CLSoftmaxLayer::run()
{
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_max_kernel, false);
CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
CLScheduler::get().enqueue(_norm_kernel);
+
+ _memory_group.release();
}
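The three kernels retained above implement the usual numerically stable softmax: a row maximum, shifted exponentials plus their sum, then a normalization pass. Subtracting the maximum before exponentiating keeps exp() from overflowing. A standalone host-side sketch of that decomposition, assuming the standard formula y_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)); this is not the CL kernel code.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> softmax(const std::vector<float> &x)
{
    // Stage 1: row maximum (the role of _max_kernel)
    const float max_val = *std::max_element(x.begin(), x.end());

    // Stage 2: shifted exponentials and their sum (the role of _shift_exp_sum_kernel -> _tmp, _sum)
    std::vector<float> tmp(x.size());
    float              sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        tmp[i] = std::exp(x[i] - max_val);
        sum += tmp[i];
    }

    // Stage 3: normalization (the role of _norm_kernel)
    for(float &v : tmp)
    {
        v /= sum;
    }
    return tmp;
}

int main()
{
    for(float v : softmax({ 1.f, 2.f, 3.f }))
    {
        std::cout << v << ' '; // ~0.09 0.24 0.67
    }
    std::cout << '\n';
}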
diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp
index 743ed5e..d187650 100644
--- a/src/runtime/CL/functions/CLTableLookup.cpp
+++ b/src/runtime/CL/functions/CLTableLookup.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLTableLookupKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLTableLookupKernel>();
k->configure(input, lut, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp
index e70f932..1b30b77 100644
--- a/src/runtime/CL/functions/CLThreshold.cpp
+++ b/src/runtime/CL/functions/CLThreshold.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLThreshold.h"
#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
{
- auto k = arm_compute::cpp14::make_unique<CLThresholdKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLThresholdKernel>();
k->configure(input, output, threshold, false_value, true_value, type, upper);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index d802b4f..cd19e25 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLTranspose::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLTransposeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
index 537e0d9..f785c75 100644
--- a/src/runtime/CL/functions/CLWarpAffine.cpp
+++ b/src/runtime/CL/functions/CLWarpAffine.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLWarpAffineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLWarpAffineKernel>();
k->configure(input, output, matrix, policy);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
index a552ab4..b445b3b 100644
--- a/src/runtime/CL/functions/CLWarpPerspective.cpp
+++ b/src/runtime/CL/functions/CLWarpPerspective.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLWarpPerspectiveKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLWarpPerspectiveKernel>();
k->configure(input, output, matrix, policy);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 8869330..a83a0bc 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -28,91 +28,89 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+#include <condition_variable>
#include <iostream>
-#include <semaphore.h>
+#include <mutex>
#include <system_error>
#include <thread>
-using namespace arm_compute;
-
-class arm_compute::Thread
+namespace arm_compute
+{
+class Thread
{
public:
- /** Start a new thread
- */
+ /** Start a new thread. */
Thread();
+
Thread(const Thread &) = delete;
Thread &operator=(const Thread &) = delete;
Thread(Thread &&) = delete;
Thread &operator=(Thread &&) = delete;
- /** Make the thread join
- */
+
+ /** Destructor. Make the thread join. */
~Thread();
+
/** Request the worker thread to start executing the given kernel
* This function will return as soon as the kernel has been sent to the worker thread.
* wait() needs to be called to ensure the execution is complete.
*/
- void start(ICPPKernel *kernel, const Window &window);
- /** Wait for the current kernel execution to complete
- */
+ void start(ICPPKernel *kernel, const Window &window, const ThreadInfo &info);
+
+ /** Wait for the current kernel execution to complete. */
void wait();
- /** Function ran by the worker thread
- */
+
+ /** Function run by the worker thread. */
void worker_thread();
private:
- std::thread _thread;
- ICPPKernel *_kernel{ nullptr };
- Window _window;
- sem_t _wait_for_work;
- sem_t _job_complete;
- std::exception_ptr _current_exception;
+ std::thread _thread;
+ ICPPKernel *_kernel{ nullptr };
+ Window _window;
+ ThreadInfo _info;
+ std::mutex _m;
+ std::condition_variable _cv;
+ bool _wait_for_work{ false };
+ bool _job_complete{ true };
+ std::exception_ptr _current_exception;
};
Thread::Thread()
- : _thread(), _window(), _wait_for_work(), _job_complete(), _current_exception(nullptr)
+ : _thread(), _window(), _info(), _m(), _cv(), _current_exception(nullptr)
{
- int ret = sem_init(&_wait_for_work, 0, 0);
- ARM_COMPUTE_ERROR_ON(ret < 0);
- ARM_COMPUTE_UNUSED(ret);
-
- ret = sem_init(&_job_complete, 0, 0);
- ARM_COMPUTE_ERROR_ON(ret < 0);
- ARM_COMPUTE_UNUSED(ret);
-
_thread = std::thread(&Thread::worker_thread, this);
}
Thread::~Thread()
{
- ARM_COMPUTE_ERROR_ON(!_thread.joinable());
-
- start(nullptr, Window());
- _thread.join();
-
- int ret = sem_destroy(&_wait_for_work);
- ARM_COMPUTE_ERROR_ON(ret < 0);
- ARM_COMPUTE_UNUSED(ret);
-
- ret = sem_destroy(&_job_complete);
- ARM_COMPUTE_ERROR_ON(ret < 0);
- ARM_COMPUTE_UNUSED(ret);
+ // Make sure worker thread has ended
+ if(_thread.joinable())
+ {
+ start(nullptr, Window(), ThreadInfo());
+ _thread.join();
+ }
}
-void Thread::start(ICPPKernel *kernel, const Window &window)
+void Thread::start(ICPPKernel *kernel, const Window &window, const ThreadInfo &info)
{
_kernel = kernel;
_window = window;
- int ret = sem_post(&_wait_for_work);
- ARM_COMPUTE_UNUSED(ret);
- ARM_COMPUTE_ERROR_ON(ret < 0);
+ _info = info;
+
+ {
+ std::lock_guard<std::mutex> lock(_m);
+ _wait_for_work = true;
+ _job_complete = false;
+ }
+ _cv.notify_one();
}
void Thread::wait()
{
- int ret = sem_wait(&_job_complete);
- ARM_COMPUTE_UNUSED(ret);
- ARM_COMPUTE_ERROR_ON(ret < 0);
+ {
+ std::unique_lock<std::mutex> lock(_m);
+ _cv.wait(lock, [&] { return _job_complete; });
+ }
+
if(_current_exception)
{
std::rethrow_exception(_current_exception);
@@ -121,9 +119,14 @@
void Thread::worker_thread()
{
- while(sem_wait(&_wait_for_work) >= 0)
+ while(true)
{
+ std::unique_lock<std::mutex> lock(_m);
+ _cv.wait(lock, [&] { return _wait_for_work; });
+ _wait_for_work = false;
+
_current_exception = nullptr;
+
// Time to exit
if(_kernel == nullptr)
{
@@ -133,49 +136,40 @@
try
{
_window.validate();
- _kernel->run(_window);
+ _kernel->run(_window, _info);
}
catch(...)
{
_current_exception = std::current_exception();
}
- int ret = sem_post(&_job_complete);
- ARM_COMPUTE_UNUSED(ret);
- ARM_COMPUTE_ERROR_ON(ret < 0);
+
+ _job_complete = true;
+ lock.unlock();
+ _cv.notify_one();
}
-
- ARM_COMPUTE_ERROR("Wait failed");
}
-namespace
-{
-void delete_threads(Thread *t)
-{
- delete[] t;
-}
-} // namespace
-
CPPScheduler &CPPScheduler::get()
{
static CPPScheduler scheduler;
return scheduler;
}
-unsigned int CPPScheduler::num_threads() const
-{
- return _num_threads;
-}
-
CPPScheduler::CPPScheduler()
: _num_threads(std::thread::hardware_concurrency()),
- _threads(std::unique_ptr<Thread[], void(*)(Thread *)>(new Thread[std::thread::hardware_concurrency() - 1], delete_threads))
+ _threads(_num_threads - 1)
{
}
void CPPScheduler::set_num_threads(unsigned int num_threads)
{
- const unsigned int num_cores = std::thread::hardware_concurrency();
- _num_threads = num_threads == 0 ? num_cores : num_threads;
+ _num_threads = num_threads == 0 ? std::thread::hardware_concurrency() : num_threads;
+ _threads.resize(_num_threads - 1);
+}
+
+unsigned int CPPScheduler::num_threads() const
+{
+ return _num_threads;
}
void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
@@ -183,43 +177,51 @@
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
/** [Scheduler example] */
+ ThreadInfo info;
+ info.cpu_info = _info;
+
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
- const unsigned int num_threads = std::min(num_iterations, _num_threads);
+ info.num_threads = std::min(num_iterations, _num_threads);
- if(!kernel->is_parallelisable() || 1 == num_threads)
+ if(num_iterations == 0)
{
- kernel->run(max_window);
+ return;
+ }
+
+ if(!kernel->is_parallelisable() || info.num_threads == 1)
+ {
+ kernel->run(max_window, info);
}
else
{
- for(unsigned int t = 0; t < num_threads; ++t)
- {
- Window win = max_window.split_window(split_dimension, t, num_threads);
- win.set_thread_id(t);
- win.set_num_threads(num_threads);
+ int t = 0;
+ auto thread_it = _threads.begin();
- if(t != num_threads - 1)
- {
- _threads[t].start(kernel, win);
- }
- else
- {
- kernel->run(win);
- }
+ for(; t < info.num_threads - 1; ++t, ++thread_it)
+ {
+ Window win = max_window.split_window(split_dimension, t, info.num_threads);
+ info.thread_id = t;
+ thread_it->start(kernel, win, info);
}
+ // Run last part on main thread
+ Window win = max_window.split_window(split_dimension, t, info.num_threads);
+ info.thread_id = t;
+ kernel->run(win, info);
+
try
{
- for(unsigned int t = 1; t < num_threads; ++t)
+ for(auto &thread : _threads)
{
- _threads[t - 1].wait();
+ thread.wait();
}
}
catch(const std::system_error &e)
{
- std::cout << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
+ std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
}
}
/** [Scheduler example] */
}
+} // namespace arm_compute
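The scheduler change above swaps POSIX semaphores for a std::mutex/std::condition_variable pair guarding two flags, with both sides waiting on a predicate so spurious wakeups are harmless. Below is a minimal standalone sketch of the same handshake, reduced to a single job with no kernel and global state for brevity; it illustrates the start()/wait()/worker_thread() ordering, not the scheduler itself.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex              m;
std::condition_variable cv;
bool                    wait_for_work = false;
bool                    job_complete  = true;
int                     result        = 0;

void worker()
{
    // Like Thread::worker_thread(): block until work is published
    std::unique_lock<std::mutex> lock(m);
    cv.wait(lock, [] { return wait_for_work; });
    wait_for_work = false;

    result = 42; // "run the kernel"

    // Report completion under the lock, then wake the waiter
    job_complete = true;
    lock.unlock();
    cv.notify_one();
}

int main()
{
    std::thread t(worker);

    // Like Thread::start(): publish the job under the lock, then notify
    {
        std::lock_guard<std::mutex> lock(m);
        wait_for_work = true;
        job_complete  = false;
    }
    cv.notify_one();

    // Like Thread::wait(): block until the worker reports completion
    {
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock, [] { return job_complete; });
    }
    std::cout << result << '\n'; // 42

    t.join();
}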
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index f086813..c8285b4 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -27,8 +27,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
SingleThreadScheduler &SingleThreadScheduler::get()
{
static SingleThreadScheduler scheduler;
@@ -38,15 +38,19 @@
void SingleThreadScheduler::set_num_threads(unsigned int num_threads)
{
ARM_COMPUTE_UNUSED(num_threads);
+ ARM_COMPUTE_ERROR_ON(num_threads != 1);
}
void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
{
ARM_COMPUTE_UNUSED(split_dimension);
- kernel->run(kernel->window());
+ ThreadInfo info;
+ info.cpu_info = cpu_info();
+ kernel->run(kernel->window(), info);
}
unsigned int SingleThreadScheduler::num_threads() const
{
return 1;
}
+} // namespace arm_compute
diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp
index b067674..3431834 100644
--- a/src/runtime/Distribution1D.cpp
+++ b/src/runtime/Distribution1D.cpp
@@ -24,14 +24,14 @@
#include "arm_compute/runtime/Distribution1D.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <cstdint>
using namespace arm_compute;
Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range)
- : IDistribution1D(num_bins, offset, range), _data(arm_compute::cpp14::make_unique<uint32_t[]>(num_bins))
+ : IDistribution1D(num_bins, offset, range), _data(arm_compute::support::cpp14::make_unique<uint32_t[]>(num_bins))
{
}
diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp
index 5d533dd..01640bb 100644
--- a/src/runtime/HOG.cpp
+++ b/src/runtime/HOG.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/HOG.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
@@ -37,7 +37,7 @@
{
ARM_COMPUTE_ERROR_ON(nullptr != _descriptor);
_info = input;
- _descriptor = arm_compute::cpp14::make_unique<float[]>(_info.descriptor_size());
+ _descriptor = arm_compute::support::cpp14::make_unique<float[]>(_info.descriptor_size());
}
float *HOG::descriptor() const
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
new file mode 100644
index 0000000..4292469
--- /dev/null
+++ b/src/runtime/IScheduler.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/IScheduler.h"
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+namespace
+{
+unsigned int get_cpu_impl()
+{
+#ifndef BARE_METAL
+ int fd = open("/proc/cpuinfo", 0); // NOLINT
+ std::array<char, 1200> buff{ {} };
+ char *pos = nullptr;
+ char *end = nullptr;
+ bool foundid = false;
+
+ int cpu = sched_getcpu();
+
+ if(fd == -1)
+ {
+ return 0;
+ }
+
+ int charsread = read(fd, buff.data(), 1200);
+ pos = buff.data();
+ end = buff.data() + charsread;
+
+ close(fd);
+
+ /* So, to date I've encountered two formats for /proc/cpuinfo.
+ *
+ * One of them just lists processor : n for each processor (with no
+ * other info), then at the end lists part information for the current
+ * CPU.
+ *
+ * The other has an entire clause (including part number info) for each
+ * CPU in the system, with "processor : n" headers.
+ *
+ * We can cope with either of these formats by waiting to see
+ * "processor: n" (where n = our CPU ID), and then looking for the next
+ * "CPU part" field.
+ */
+ while(pos < end)
+ {
+ if(foundid && strncmp(pos, "CPU part", 8) == 0)
+ {
+ /* Found part number */
+ pos += 11;
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ break;
+ }
+ }
+
+ return strtoul(pos, nullptr, 0);
+ }
+
+ if(strncmp(pos, "processor", 9) == 0)
+ {
+ /* Found processor ID, see if it's ours. */
+ pos += 11;
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ break;
+ }
+ }
+
+ int num = strtol(pos, nullptr, 0);
+
+ if(num == cpu)
+ {
+ foundid = true;
+ }
+ }
+
+ while(pos < end)
+ {
+ char ch = *pos++;
+ if(ch == '\n' || ch == '\0')
+ {
+ break;
+ }
+ }
+ }
+#endif /* BARE_METAL */
+
+ return 0;
+}
+} // namespace
+
+namespace arm_compute
+{
+IScheduler::IScheduler()
+{
+ switch(get_cpu_impl())
+ {
+ case 0xd03:
+ _info.CPU = CPUTarget::A53;
+ break;
+ default:
+#ifdef __arm__
+ _info.CPU = CPUTarget::ARMV7;
+#elif __aarch64__
+ _info.CPU = CPUTarget::ARMV8;
+#else /* __arm__ || __aarch64__ */
+ _info.CPU = CPUTarget::INTRINSICS;
+#endif /* __arm__ || __aarch64__ */
+ break;
+ }
+
+ _info.L1_size = 31000;
+ _info.L2_size = 500000;
+}
+
+void IScheduler::set_target(CPUTarget target)
+{
+ _info.CPU = target;
+}
+
+CPUInfo IScheduler::cpu_info() const
+{
+ return _info;
+}
+} // namespace arm_compute
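The new IScheduler constructor reads /proc/cpuinfo once, finds the clause describing the core it is running on, and maps the hexadecimal "CPU part" value to a CPUTarget (0xd03 selects A53; anything else falls back to a generic ARMv7/ARMv8/intrinsics target), then seeds rough L1/L2 cache sizes. A simplified sketch of that probe using iostreams follows; the function name and the way the two cpuinfo layouts are handled are approximations, not the library code.

#include <fstream>
#include <string>

unsigned int cpu_part_for(int target_cpu)
{
    std::ifstream cpuinfo("/proc/cpuinfo");
    std::string   line;
    bool          in_target_clause = false;

    while(std::getline(cpuinfo, line))
    {
        const auto colon = line.find(':');
        if(colon == std::string::npos)
        {
            continue;
        }

        if(line.compare(0, 9, "processor") == 0)
        {
            // "processor : n" -- remember whether the clause that follows describes our core.
            in_target_clause = (std::stoi(line.substr(colon + 1)) == target_cpu);
        }
        else if(in_target_clause && line.compare(0, 8, "CPU part") == 0)
        {
            // "CPU part : 0xd03" -- parse the hex part number (0xd03 is Cortex-A53).
            return std::stoul(line.substr(colon + 1), nullptr, 0);
        }
    }

    return 0; // Unknown: the caller falls back to a generic target.
}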
diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp
index 17baf21..eb9051c 100644
--- a/src/runtime/LutAllocator.cpp
+++ b/src/runtime/LutAllocator.cpp
@@ -23,7 +23,7 @@
*/
#include "arm_compute/runtime/LutAllocator.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
@@ -39,7 +39,7 @@
void LutAllocator::allocate()
{
- _buffer = arm_compute::cpp14::make_unique<uint8_t[]>(size());
+ _buffer = arm_compute::support::cpp14::make_unique<uint8_t[]>(size());
}
uint8_t *LutAllocator::lock()
diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp
new file mode 100644
index 0000000..4dfa28b
--- /dev/null
+++ b/src/runtime/MemoryManagerOnDemand.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/ILifetimeManager.h"
+#include "arm_compute/runtime/IPoolManager.h"
+
+#include <memory>
+
+using namespace arm_compute;
+
+MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager)
+ : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)), _allocator(nullptr), _is_finalized(false), _num_pools(1)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!");
+ ARM_COMPUTE_ERROR_ON_MSG(!_pool_mgr, "Pool manager not specified correctly!");
+}
+
+bool MemoryManagerOnDemand::is_finalized() const
+{
+ return _is_finalized;
+}
+
+void MemoryManagerOnDemand::set_num_pools(unsigned int num_pools)
+{
+ ARM_COMPUTE_ERROR_ON(num_pools == 0);
+ _num_pools = num_pools;
+}
+
+void MemoryManagerOnDemand::set_allocator(IAllocator *allocator)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(is_finalized(), "Memory manager is already finalized!");
+ ARM_COMPUTE_ERROR_ON(allocator == nullptr);
+ _allocator = allocator;
+}
+
+ILifetimeManager *MemoryManagerOnDemand::lifetime_manager()
+{
+ return _lifetime_mgr.get();
+}
+
+IPoolManager *MemoryManagerOnDemand::pool_manager()
+{
+ return _pool_mgr.get();
+}
+
+void MemoryManagerOnDemand::finalize()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(is_finalized(), "Memory manager is already finalized!");
+ ARM_COMPUTE_ERROR_ON(!_lifetime_mgr);
+ ARM_COMPUTE_ERROR_ON(!_pool_mgr);
+ ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr->are_all_finalized(), "Not all object lifetimes have been finalized!");
+ ARM_COMPUTE_ERROR_ON(_allocator == nullptr);
+
+ // Create pools
+ auto pool_template = _lifetime_mgr->create_pool(_allocator);
+ for(int i = _num_pools; i > 1; --i)
+ {
+ auto pool = pool_template->duplicate();
+ _pool_mgr->register_pool(std::move(pool));
+ }
+ _pool_mgr->register_pool(std::move(pool_template));
+
+ // Set finalized to true
+ _is_finalized = true;
+}
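MemoryManagerOnDemand is new in this drop. The intended wiring is: construct it from a lifetime manager and a pool manager, hand it to the functions that should share intermediate storage, then attach an allocator, pick the number of pools and call finalize() once those functions are configured. A sketch of that flow is below; the concrete BlobLifetimeManager/PoolManager/Allocator class names and headers are assumed rather than taken from this patch.

#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"

// Assumed concrete managers/allocator; exact headers and class names are not part of this diff.
#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

void build_with_memory_manager()
{
    using namespace arm_compute;

    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>(); // assumed ILifetimeManager implementation
    auto pool_mgr     = std::make_shared<PoolManager>();         // assumed IPoolManager implementation
    auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // Functions constructed with the same manager share their intermediate buffers through it.
    NEConvolutionLayer conv(mm);
    // ... conv.configure(...) and any other functions using mm ...

    // Once everything is configured, back the pools with an allocator and finalize.
    Allocator allocator; // assumed IAllocator implementation
    mm->set_allocator(&allocator);
    mm->set_num_pools(1);
    mm->finalize();
}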
diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp
index 003dc93..e0b60b1 100644
--- a/src/runtime/MultiHOG.cpp
+++ b/src/runtime/MultiHOG.cpp
@@ -24,13 +24,13 @@
#include "arm_compute/runtime/MultiHOG.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IMultiHOG.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
MultiHOG::MultiHOG(size_t num_models)
- : _num_models(num_models), _model(arm_compute::cpp14::make_unique<HOG[]>(_num_models))
+ : _num_models(num_models), _model(arm_compute::support::cpp14::make_unique<HOG[]>(_num_models))
{
}
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index 6f0da85..23d9872 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -27,13 +27,14 @@
using namespace arm_compute;
-INESimpleFunction::INESimpleFunction()
- : _kernel(), _border_handler()
+INESimpleFunction::INESimpleFunction() // NOLINT
+ : _kernel(),
+ _border_handler()
{
}
void INESimpleFunction::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
NEScheduler::get().schedule(_kernel.get(), Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
index b39feb3..b4620f1 100644
--- a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
+++ b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEAbsoluteDifferenceKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEAbsoluteDifferenceKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
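The recurring change through the remaining NEON functions is purely mechanical: the C++14 make_unique helper moved from arm_compute/core/Helpers.h to support/ToolchainSupport.h under arm_compute::support::cpp14. A compact illustration of the resulting configure pattern (build_kernel is an illustrative helper, not part of the library):

#include "support/ToolchainSupport.h"

#include <memory>
#include <utility>

template <typename KernelType, typename... Args>
std::unique_ptr<KernelType> build_kernel(Args &&... args)
{
    // Same pattern as the configure() bodies above: create, configure, hand over ownership.
    auto k = arm_compute::support::cpp14::make_unique<KernelType>();
    k->configure(std::forward<Args>(args)...);
    return k;
}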
diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp
index c39abfc..49524d2 100644
--- a/src/runtime/NEON/functions/NEAccumulate.cpp
+++ b/src/runtime/NEON/functions/NEAccumulate.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEAccumulate::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEAccumulateKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEAccumulateKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
@@ -41,13 +41,13 @@
{
if(use_fp16)
{
- auto k = arm_compute::cpp14::make_unique<NEAccumulateWeightedFP16Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEAccumulateWeightedFP16Kernel>();
k->configure(input, alpha, output);
_kernel = std::move(k);
}
else
{
- auto k = arm_compute::cpp14::make_unique<NEAccumulateWeightedKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEAccumulateWeightedKernel>();
k->configure(input, alpha, output);
_kernel = std::move(k);
}
@@ -55,7 +55,7 @@
void NEAccumulateSquared::configure(const ITensor *input, uint32_t shift, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEAccumulateSquaredKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEAccumulateSquaredKernel>();
k->configure(input, shift, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index f5d81d7..57a1738 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -23,14 +23,14 @@
*/
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-void NEActivationLayer::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
{
- auto k = arm_compute::cpp14::make_unique<NEActivationLayerKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernel>();
k->configure(input, output, activation_info);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 50cc38b..11f5aa7 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
{
- auto k = arm_compute::cpp14::make_unique<NEArithmeticAdditionKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index a3d27c0..37586af 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
{
- auto k = arm_compute::cpp14::make_unique<NEArithmeticSubtractionKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEArithmeticSubtractionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index a24429c..ef79b02 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -37,7 +37,7 @@
{
}
-void NEBatchNormalizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
{
// Configure kernel
_norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
index 5aafc51..7982095 100644
--- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEBitwiseAndKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBitwiseAndKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
index af3df6e..c55957e 100644
--- a/src/runtime/NEON/functions/NEBitwiseNot.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEBitwiseNot::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEBitwiseNotKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBitwiseNotKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
index d12c5e5..01036da 100644
--- a/src/runtime/NEON/functions/NEBitwiseOr.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEBitwiseOrKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBitwiseOrKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
index 65c943e..4591698 100644
--- a/src/runtime/NEON/functions/NEBitwiseXor.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEBitwiseXorKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBitwiseXorKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp
index 7f0b45d..46cf259 100644
--- a/src/runtime/NEON/functions/NEBox3x3.cpp
+++ b/src/runtime/NEON/functions/NEBox3x3.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -35,13 +35,13 @@
{
if(use_fp16)
{
- auto k = arm_compute::cpp14::make_unique<NEBox3x3FP16Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBox3x3FP16Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
}
else
{
- auto k = arm_compute::cpp14::make_unique<NEBox3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEBox3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 26f31f5..9be1df6 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
@@ -35,14 +34,27 @@
#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <cstring>
#include <utility>
using namespace arm_compute;
-NECannyEdge::NECannyEdge()
- : _sobel(), _gradient(), _non_max_suppr(), _edge_trace(), _border_mag_gradient(), _border_edge_trace(), _gx(), _gy(), _magnitude(), _phase(), _nonmax(), _output(nullptr)
+NECannyEdge::NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _sobel(),
+ _gradient(),
+ _non_max_suppr(),
+ _edge_trace(),
+ _border_mag_gradient(),
+ _border_edge_trace(),
+ _gx(),
+ _gy(),
+ _magnitude(),
+ _phase(),
+ _nonmax(),
+ _output(nullptr)
{
}
@@ -82,22 +94,26 @@
_phase.allocator()->init(info);
_nonmax.allocator()->init(info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Configure/Init sobelNxN
if(gradient_size == 3)
{
- auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel3x3>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 5)
{
- auto k = arm_compute::cpp14::make_unique<NESobel5x5>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel5x5>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 7)
{
- auto k = arm_compute::cpp14::make_unique<NESobel7x7>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel7x7>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
@@ -106,20 +122,31 @@
ARM_COMPUTE_ERROR("Gradient size not supported\n");
}
+ // Manage intermediate buffers
+ _memory_group.manage(&_magnitude);
+ _memory_group.manage(&_phase);
+
// Configure gradient
if(use_fp16)
{
- auto k = arm_compute::cpp14::make_unique<NEGradientFP16Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEGradientFP16Kernel>();
k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
_gradient = std::move(k);
}
else
{
- auto k = arm_compute::cpp14::make_unique<NEGradientKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
_gradient = std::move(k);
}
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_nonmax);
+
// Configure non-maxima suppression
_non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED);
@@ -127,6 +154,10 @@
// it. If border mode is undefined filling the border is a nop.
_border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value);
+ // Allocate intermediate tensors
+ _phase.allocator()->allocate();
+ _magnitude.allocator()->allocate();
+
// Configure edge tracing
_edge_trace.configure(&_nonmax, output);
@@ -134,10 +165,6 @@
_border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0);
// Allocate intermediate tensors
- _gx.allocator()->allocate();
- _gy.allocator()->allocate();
- _phase.allocator()->allocate();
- _magnitude.allocator()->allocate();
_nonmax.allocator()->allocate();
}
@@ -146,11 +173,13 @@
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
ARM_COMPUTE_ERROR_ON(_output == nullptr);
+ _memory_group.acquire();
+
// Run sobelNxN
_sobel->run();
// Fill border before non-maxima suppression. Nop for border mode undefined.
- _border_mag_gradient.run(_border_mag_gradient.window());
+ NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
// Run gradient
NEScheduler::get().schedule(_gradient.get(), Window::DimY);
@@ -162,8 +191,10 @@
memset(_output->buffer(), 0, _output->info()->total_size());
// Fill border before edge trace
- _border_edge_trace.run(_border_edge_trace.window());
+ NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ);
// Run edge tracing
- _edge_trace.run(_edge_trace.window());
+ NEScheduler::get().schedule(&_edge_trace, Window::DimY);
+
+ _memory_group.release();
}
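NECannyEdge, like the other functions gaining a memory manager in this patch, follows a fixed MemoryGroup discipline: manage() an intermediate tensor before the kernels that use it are configured, allocate() it after the last consumer is configured, and bracket run() with acquire()/release() so pool memory is held only while the function executes. A minimal sketch of that shape, assuming the member type is arm_compute::MemoryGroup as declared in the function headers:

#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"

#include <memory>
#include <utility>

class ExampleFunction
{
public:
    ExampleFunction(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
        : _memory_group(std::move(memory_manager)), _tmp()
    {
    }

    void configure(/* inputs / outputs */)
    {
        // 1) Declare the intermediate to the group before configuring its consumers...
        _memory_group.manage(&_tmp);
        // ... configure the kernels that produce and consume _tmp ...
        // 2) ...and allocate it only once the last consumer has been configured.
        _tmp.allocator()->allocate();
    }

    void run()
    {
        // 3) Backing memory is only guaranteed between acquire() and release().
        _memory_group.acquire();
        // ... schedule the kernels ...
        _memory_group.release();
    }

private:
    arm_compute::MemoryGroup _memory_group;
    arm_compute::Tensor      _tmp;
};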
diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp
index 84d4fff..9166aa9 100644
--- a/src/runtime/NEON/functions/NEChannelCombine.cpp
+++ b/src/runtime/NEON/functions/NEChannelCombine.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,14 +32,14 @@
void NEChannelCombine::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEChannelCombineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEChannelCombineKernel>();
k->configure(plane0, plane1, plane2, plane3, output);
_kernel = std::move(k);
}
void NEChannelCombine::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<NEChannelCombineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEChannelCombineKernel>();
k->configure(plane0, plane1, plane2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp
index 634e918..7b8a993 100644
--- a/src/runtime/NEON/functions/NEChannelExtract.cpp
+++ b/src/runtime/NEON/functions/NEChannelExtract.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,14 +32,14 @@
void NEChannelExtract::configure(const ITensor *input, Channel channel, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEChannelExtractKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEChannelExtractKernel>();
k->configure(input, channel, output);
_kernel = std::move(k);
}
void NEChannelExtract::configure(const IMultiImage *input, Channel channel, IImage *output)
{
- auto k = arm_compute::cpp14::make_unique<NEChannelExtractKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEChannelExtractKernel>();
k->configure(input, channel, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp
index bbaa832..b9fe1ff 100644
--- a/src/runtime/NEON/functions/NEColorConvert.cpp
+++ b/src/runtime/NEON/functions/NEColorConvert.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,28 +32,28 @@
void NEColorConvert::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void NEColorConvert::configure(const IMultiImage *input, IImage *output)
{
- auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void NEColorConvert::configure(const IImage *input, IMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void NEColorConvert::configure(const IMultiImage *input, IMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index 3f39ae2..f10ffa6 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
#include "arm_compute/core/PixelValue.h"
@@ -33,6 +32,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <array>
#include <utility>
@@ -41,15 +41,15 @@
void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEConvolution3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEConvolution3x3Kernel>();
k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
template <unsigned int matrix_size>
-NEConvolutionSquare<matrix_size>::NEConvolutionSquare()
- : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+NEConvolutionSquare<matrix_size>::NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
{
}
@@ -72,6 +72,10 @@
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type));
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
+ // Calculate scale
if(scale == 0)
{
scale = calculate_matrix_scale(conv, matrix_size);
@@ -94,12 +98,16 @@
template <unsigned int matrix_size>
void NEConvolutionSquare<matrix_size>::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
if(_is_separable)
{
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+
+ _memory_group.release();
}
else
{
@@ -113,7 +121,7 @@
void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEConvolutionRectangleKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEConvolutionRectangleKernel>();
k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index bd688cf..40862fc 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -23,32 +23,41 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
#include <cmath>
#include <tuple>
-using namespace arm_compute;
-
-NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
- : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+namespace arm_compute
+{
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
}
void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
@@ -69,8 +78,11 @@
TensorInfo info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
_weights_reshaped.allocator()->init(info_wr);
+ _memory_group.manage(&_weights_reshaped);
+
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_weights_transposed_kernel.configure(&_weights_reshaped, output);
+
_weights_reshaped.allocator()->allocate();
}
else
@@ -81,32 +93,34 @@
void NEConvolutionLayerReshapeWeights::run()
{
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
if(_transpose1xW)
{
NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
}
+
+ _memory_group.release();
}
-NEConvolutionLayer::NEConvolutionLayer()
- : _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
- _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _output_col2im_kernel(),
+ _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _workspace(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
{
}
void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
@@ -131,94 +145,165 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size() : weights->info()->dimension(0);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
- stride_x, stride_y, pad_x, pad_y, conv_info.round());
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
+ const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+ conv_info);
- // Check if its a "fully connected" convolution
+ // Check if its a "fully connected" convolution, i.e. the output size is 1x1xnum_kernels
_is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+ }
+#elif defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+ }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
// Reshape weights if needed
- if(_are_weights_reshaped)
+ if(_mm_optimised_kernel != nullptr)
{
- mat_weights_cols = output->info()->dimension(2);
- const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
- mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
- }
- else
- {
- if(_is_fully_connected_convolution)
+ if(_are_weights_reshaped)
{
- // Create tensor to store the reshaped weights
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
- _weights_reshaped.allocator()->init(info_wr);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(1);
}
else
{
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _weights_reshaped.allocator()->init(info_wt);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+ TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+
+ // Create tensor to store the reshaped weights
+ _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ weights = &_weights_reshaped;
}
- weights = &_weights_reshaped;
+ }
+ else
+ {
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(0) / 4 + (_has_bias ? 1 : 0);
+ }
+ else
+ {
+ TensorShape reshaped_weights_shape;
+
+ if(_is_fully_connected_convolution)
+ {
+ reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / input->info()->element_size();
+ reshaped_weights_shape = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+ static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+ }
+
+ // Create tensor to store the reshaped weights
+ _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, !_is_fully_connected_convolution /* 1xW transpose */);
+ weights = &_weights_reshaped;
+ }
}
// Create tensor to store im2col reshaped inputs
const unsigned int mat_input_cols = mat_weights_rows;
const unsigned int mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
+
+ TensorShape shape_im2col(input->info()->tensor_shape());
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
_input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+ _memory_group.manage(&_input_im2col_reshaped);
// Create tensor (interleave) to prepare input tensor for GEMM
- if(!_is_fully_connected_convolution)
+ if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr)
{
- TensorShape shape_interleaved = shape_im2col;
+ TensorShape shape_interleaved(shape_im2col);
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
_input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ _memory_group.manage(&_input_interleaved_reshaped);
}
// Create GEMM output tensor
- TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+ _memory_group.manage(&_gemm_output);
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
- if(_is_fully_connected_convolution)
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+
+#if defined(__arm__) || defined(__aarch64__)
+ if(_mm_optimised_kernel != nullptr)
{
- _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+ const int M = _gemm_output.info()->tensor_shape().y();
+ const int N = _gemm_output.info()->tensor_shape().x();
+ const int K = _input_im2col_reshaped.info()->tensor_shape().x();
+
+#if defined(__arm__)
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+ constexpr size_t alignment = 4096;
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
+
+ // Configure matrix multiplication kernel
+ if(_is_fully_connected_convolution)
+ {
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f, false, false);
+ }
+ else
+ {
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
+ }
+
+ _workspace.allocator()->allocate();
}
else
+#endif /* defined(__arm__) || defined(__aarch64__) */
{
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ if(_is_fully_connected_convolution)
+ {
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ else
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
}
+
+ _input_im2col_reshaped.allocator()->allocate();
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _gemm_output.allocator()->allocate();
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
// Allocate intermediate tensor
if(!_are_weights_reshaped)
{
_weights_reshaped.allocator()->allocate();
}
- _input_im2col_reshaped.allocator()->allocate();
- if(!_is_fully_connected_convolution)
- {
- _input_interleaved_reshaped.allocator()->allocate();
- }
- _gemm_output.allocator()->allocate();
}
void NEConvolutionLayer::run()
@@ -230,17 +315,30 @@
_reshape_weights.run();
}
+ _memory_group.acquire();
+
// Run input reshaping
NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
- if(!_is_fully_connected_convolution)
- {
- // Run interleave
- NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
- }
// Runs matrix multiply on reshaped matrices
- NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+ if(_mm_optimised_kernel != nullptr)
+ {
+ NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ }
+ else
+ {
+ if(!_is_fully_connected_convolution)
+ {
+ // Run interleave
+ NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+ }
+
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+ }
// Reshape output matrix
NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+
+ _memory_group.release();
}
+} // namespace arm_compute
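Two things change in NEConvolutionLayer: intermediate tensors are routed through the memory group, and on ARMv7/ARMv8 with F32 data the generic GEMM kernel is replaced by the assembly-backed NEGEMMAArch32/NEGEMMAArch64 kernels plus a per-thread workspace sized from GemmInterleaved::get_working_size(). The im2col + GEMM bookkeeping is otherwise unchanged; as a concrete check of the shape arithmetic above (assuming a hypothetical 224x224x3 input, 64 kernels of 7x7x3, stride 2, pad 3, a bias, and floor rounding):

// conv_w = conv_h   = (224 + 2*3 - 7) / 2 + 1                = 112
// mat_weights_cols  = number of kernels                      = 64
// mat_weights_rows  = 7 * 7 * 3 + 1 (bias row)               = 148
// shape_im2col      = { mat_weights_rows, conv_w * conv_h }  = { 148, 12544 }
// shape_gemm        = { mat_weights_cols, conv_w * conv_h }  = { 64, 12544 }
// i.e. the layer is computed as a single (12544 x 148) * (148 x 64) matrix product.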
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
index 7d2c549..ddf7e90 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
@@ -24,28 +24,29 @@
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-NEDepthConcatenate::NEDepthConcatenate()
- : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+NEDepthConcatenate::NEDepthConcatenate() // NOLINT
+ : _inputs_vector(),
+ _concat_kernels_vector(),
+ _border_handlers_vector(),
+ _num_inputs(0)
{
}
-void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
{
ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
_num_inputs = inputs_vector.size();
- _concat_kernels_vector = arm_compute::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
- _border_handlers_vector = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+ _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
unsigned int depth_offset = 0;
for(unsigned int i = 0; i < _num_inputs; ++i)
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
index a339cae..37857b6 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvert.cpp
@@ -23,22 +23,16 @@
*/
#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
using namespace arm_compute;
-void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
- ARM_COMPUTE_ERROR_ON(input == output);
- ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type());
-
- auto k = arm_compute::cpp14::make_unique<NEDepthConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
k->configure(input, output, policy, shift);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
new file mode 100644
index 0000000..a58b6e4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEDequantizationLayer::NEDequantizationLayer()
+ : _dequantize_kernel()
+{
+}
+
+void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
+{
+ // Configure kernel
+ _dequantize_kernel.configure(input, output, min_max);
+}
+
+void NEDequantizationLayer::run()
+{
+ NEScheduler::get().schedule(&_dequantize_kernel, Window::DimY);
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
index 2887c13..8118030 100644
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -42,11 +42,11 @@
ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
_kernel.configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
void NEDerivative::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
NEScheduler::get().schedule(&_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEDilate.cpp b/src/runtime/NEON/functions/NEDilate.cpp
index 0c016f1..5c733a8 100644
--- a/src/runtime/NEON/functions/NEDilate.cpp
+++ b/src/runtime/NEON/functions/NEDilate.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEDilate.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEDilate::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEDilateKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEDilateKernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 3f3e771..b831a6a 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -33,15 +33,13 @@
using namespace arm_compute;
-NEDirectConvolutionLayer::NEDirectConvolutionLayer()
- : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
{
}
void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
-
// Free accumulator
if(_accumulator.buffer() != nullptr)
{
@@ -49,17 +47,38 @@
}
// Allocate the intermediate accumulator tensor in case of fixed point input
- if(output->info()->data_type() == DataType::QS8)
+ switch(output->info()->data_type())
{
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
- _conv_kernel.configure(input, weights, &_accumulator, conv_info);
- _accumulate_bias_kernel.configure(&_accumulator, bias, output);
- _accumulator.allocator()->allocate();
- }
- else
- {
- _conv_kernel.configure(input, weights, output, conv_info);
- _accumulate_bias_kernel.configure(output, bias);
+ case DataType::QS8:
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
+ _memory_group.manage(&_accumulator);
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ break;
+ }
+ case DataType::QS16:
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position()));
+ _memory_group.manage(&_accumulator);
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ break;
+ }
+ case DataType::F16:
+ case DataType::F32:
+ {
+ _conv_kernel.configure(input, weights, output, conv_info);
+ _accumulate_bias_kernel.configure(output, bias);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
}
// Add zero padding XY
@@ -68,8 +87,12 @@
void NEDirectConvolutionLayer::run()
{
- _input_border_handler.run(_input_border_handler.window());
+ NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
+
+ _memory_group.acquire();
NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+
+ _memory_group.release();
}
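The direct convolution's intermediate accumulator is now registered with a MemoryGroup so that, when several functions share one IMemoryManager, their temporaries can reuse the same backing memory. The lifecycle is: manage() the tensor before configuring the kernels that touch it, allocate() it once its last consumer is configured, and bracket run() with acquire()/release(). A minimal sketch of that pattern, assuming the runtime MemoryGroup header at arm_compute/runtime/MemoryGroup.h; the class and member names below are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"

#include <memory>
#include <utility>

// Illustrative function skeleton showing the manage/allocate/acquire/release
// lifecycle used throughout this patch. Kernel configuration is elided.
class ExampleFunction
{
public:
    ExampleFunction(std::shared_ptr<arm_compute::IMemoryManager> memory_manager = nullptr)
        : _memory_group(std::move(memory_manager)), _tmp()
    {
    }

    void configure(const arm_compute::TensorShape &shape)
    {
        _tmp.allocator()->init(arm_compute::TensorInfo(shape, 1, arm_compute::DataType::F32));
        _memory_group.manage(&_tmp);  // register before configuring the consumers
        // ... configure the kernels that read/write _tmp ...
        _tmp.allocator()->allocate(); // allocate once the last consumer is configured
    }

    void run()
    {
        _memory_group.acquire();      // obtain backing memory for managed tensors
        // ... schedule the kernels ...
        _memory_group.release();      // hand the memory back for reuse
    }

private:
    arm_compute::MemoryGroup _memory_group;
    arm_compute::Tensor      _tmp;
};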
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
index f6ec677..70b93ca 100644
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -55,7 +55,7 @@
NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
// Calculate cumulative distribution of histogram and create LUT.
- _cd_histogram_kernel.run(_cd_histogram_kernel.window());
+ NEScheduler::get().schedule(&_cd_histogram_kernel, Window::DimY);
// Map input to output using created LUT.
NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp
index 9b011db..3609572 100644
--- a/src/runtime/NEON/functions/NEErode.cpp
+++ b/src/runtime/NEON/functions/NEErode.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEErode.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEErode::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEErodeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEErodeKernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 33a58f1..4137b1d 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -35,8 +35,9 @@
using namespace arm_compute;
-NEFastCorners::NEFastCorners()
- : _fast_corners_kernel(),
+NEFastCorners::NEFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)),
+ _fast_corners_kernel(),
_border_handler(),
_nonmax_kernel(),
_fill_kernel(),
@@ -59,6 +60,7 @@
TensorInfo tensor_info(input->info()->tensor_shape(), Format::U8);
_output.allocator()->init(tensor_info);
+ _memory_group.manage(&_output);
// If border is UNDEFINED _fast_corners_kernel will operate in xwindow (3,
// width - 3) and ywindow (3, height -3) so the output image will leave the
@@ -75,6 +77,7 @@
else
{
_suppressed.allocator()->init(tensor_info);
+ _memory_group.manage(&_suppressed);
_nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
_fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
@@ -88,7 +91,9 @@
void NEFastCorners::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+ _memory_group.acquire();
NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
@@ -98,4 +103,6 @@
}
NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index e884f4a..44e4952 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -30,7 +30,7 @@
void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
{
- _border_handler.configure(input, border_width, border_mode, constant_border_value);
+ _border_handler.configure(input, BorderSize(border_width), border_mode, constant_border_value);
}
void NEFillBorder::run()
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
new file mode 100644
index 0000000..0000cdd
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFloor.h"
+
+#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEFloor::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEFloorKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
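NEFloor is a new simple function wrapping NEFloorKernel, so run() comes from the common simple-function base class. A minimal usage sketch; the shapes and the F32 data type are illustrative and should be checked against the kernel's documented support:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEFloor.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src;
    Tensor dst;

    // Illustrative 2D F32 tensors; the function computes dst(x) = floor(src(x)).
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    NEFloor floor_fn;
    floor_fn.configure(&src, &dst);

    // Allocate after configuration, fill src, then run.
    src.allocator()->allocate();
    dst.allocator()->allocate();

    floor_fn.run();
    return 0;
}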
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index abb41e9..2e8d105 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -23,27 +23,28 @@
*/
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include <algorithm>
#include <cmath>
-using namespace arm_compute;
-
-NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights()
- : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+namespace arm_compute
+{
+NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
{
}
void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 2);
ARM_COMPUTE_ERROR_ON(output == nullptr);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
- ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+ ARM_COMPUTE_ERROR_ON(!transpose_weights && !is_batched_fc_layer);
- const DataType dt = input->info()->data_type();
+ const DataType data_type = input->info()->data_type();
const int fixed_point_position = input->info()->fixed_point_position();
_transpose_weights = transpose_weights;
@@ -56,7 +57,8 @@
{
// Initialize the output tensor for transpose
TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
- _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+ _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, data_type, fixed_point_position));
+ _memory_group.manage(&_transpose_output);
_transpose_kernel.configure(input, &_transpose_output);
// Configure transpose 1xW kernel
@@ -86,229 +88,161 @@
void NEFullyConnectedLayerReshapeWeights::run()
{
+ _memory_group.acquire();
+
if(_transpose_weights)
{
NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
}
+
if(_is_batched_fc_layer)
{
NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY);
}
+
+ _memory_group.release();
}
-NEFullyConnectedLayer::NEFullyConnectedLayer()
- : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
- _are_weights_reshaped(false), _is_fc_after_conv(false), _is_batched_fc_layer(false), _accumulate_biases(false)
+NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
+ _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
{
}
-void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for im2col
- TensorShape shape_im2col;
- shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
- shape_im2col.set(1, input->info()->dimension(3));
- shape_im2col.set(2, input->info()->dimension(4));
- shape_im2col.set(3, input->info()->dimension(5));
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
-
- // Initialize output tensor for interleave 4x4
- TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
- // Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure interleave4x4 kernel
- _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
- // Allocate the tensors once all the configure methods have been called
- _im2col_output.allocator()->allocate();
- _interleave4x4_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // Initialize output tensor for interleave 4x4
- TensorShape shape_interleaved = input->info()->tensor_shape();
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
- // Configure interleave4x4 kernel
- _interleave4x4_kernel.configure(input, &_interleave4x4_output);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
- // Allocate the tensors once all the configure methods have been called
- _interleave4x4_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for im2col
- TensorShape shape_im2col;
- shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
- shape_im2col.set(1, 1);
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
-
- // Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
-
- // Allocate the output tensor for im2col once all the configure methods have been called
- _im2col_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(input, weights, output, 1.0f);
-}
-
void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- _are_weights_reshaped = are_weights_reshaped;
- _is_fc_after_conv = true;
- _is_batched_fc_layer = false;
- _accumulate_biases = false;
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-
- _accumulate_biases = true;
-
- // Configure accumulate biases kernel
- _accumulate_biases_kernel.configure(output, biases);
- }
-
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
// 2) Fully Connected layer -> Fully Connected layer without batches
// 3) Convolution layer -> Fully Connected layer with batches
// 4) Fully Connected layer -> Fully Connected layer with batches
- // Check if we have a fully connected layer with batches
- _is_batched_fc_layer = (output->info()->dimension(1) > 1);
+ // Expected shape before transpose and reshaping
+ // Input: In x B (In and B can be multi-dimensional)
+ // Weights: flat(In) x Out
+ // Biases: Out
+ // Output: Out x B (B can be multi-dimensional)
- const ITensor *weights_to_use = weights;
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output);
- if(!are_weights_reshaped)
+ const DataType data_type = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+ const int num_batch_dimensions = std::max(0, static_cast<int>(output->info()->tensor_shape().num_dimensions()) - 1);
+ const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
+ const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
+
+ _linearize_input = input->info()->tensor_shape().x() != linear_input_size;
+ _are_weights_reshaped = are_weights_reshaped;
+ _accumulate_biases = biases != nullptr;
+ _is_batched_fc_layer = num_batch_dimensions > 0;
+
+ // Check if number of batches match
+ ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size_upper(num_input_dimensions) != output->info()->tensor_shape().total_size_upper(1));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
+
+ const size_t interleave_width = 16 / input->info()->element_size();
+ const ITensor *weights_to_use = weights;
+
+ if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
{
- if((transpose_weights || _is_batched_fc_layer))
+ weights_to_use = &_reshape_weights_output;
+
+ TensorShape reshaped_weights_shape(weights->info()->tensor_shape());
+
+ // Transpose weights if the user hasn't done it
+ if(transpose_weights)
{
- weights_to_use = &_reshape_weights_output;
-
- if(transpose_weights)
- {
- if(_is_batched_fc_layer)
- {
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
- else
- {
- TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
-
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
-
- // Reshape the weights
- _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ const size_t shape_x = reshaped_weights_shape.x();
+ reshaped_weights_shape.set(0, reshaped_weights_shape.y());
+ reshaped_weights_shape.set(1, shape_x);
}
+
+ // If we run multiple batches we need the 1xW transpose, too.
+ if(_is_batched_fc_layer)
+ {
+ const float shape_x = reshaped_weights_shape.x();
+ reshaped_weights_shape.set(0, reshaped_weights_shape.y() * interleave_width);
+ reshaped_weights_shape.set(1, static_cast<unsigned int>(std::ceil(shape_x / interleave_width)));
+ }
+
+ _reshape_weights_output.allocator()->init(TensorInfo(reshaped_weights_shape, 1, data_type, fixed_point_position));
+
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ }
+
+ // Check correct shape of weights
+ if(_is_batched_fc_layer)
+ {
+ // Transpose + Transpose1xW
+ ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != linear_input_size * interleave_width);
+ ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != static_cast<unsigned int>(std::ceil(static_cast<float>(output->info()->tensor_shape().x()) / interleave_width)));
+ }
+ else
+ {
+ // Transpose
+ ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != output->info()->tensor_shape().x());
+ ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != linear_input_size);
+ }
+
+ const ITensor *multiply_input = input;
+
+ if(_linearize_input)
+ {
+ TensorShape shape_im2col(input->info()->tensor_shape());
+ shape_im2col.collapse(num_input_dimensions);
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, data_type, fixed_point_position));
+
+ // Configure im2col kernel
+ _memory_group.manage(&_im2col_output);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ multiply_input = &_im2col_output;
}
if(_is_batched_fc_layer)
{
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ TensorShape shape_interleaved(multiply_input->info()->tensor_shape());
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, data_type, fixed_point_position));
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer with batches
- configure_conv_fc_wb(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer with batches
- configure_fc_fc_wb(input, weights_to_use, output);
- }
+ // Configure interleave4x4 kernel
+ _memory_group.manage(&_interleave4x4_output);
+ _interleave4x4_kernel.configure(multiply_input, &_interleave4x4_output);
+
+ multiply_input = &_interleave4x4_output;
}
- else
- {
- // In case of not batched fully connected layer, the weights will not be reshaped using transposed1xW
- _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc_nb(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc_nb(input, weights_to_use, output);
- }
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(multiply_input, weights_to_use, output, 1.0f);
+
+ if(_accumulate_biases)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->tensor_shape().x() != output->info()->tensor_shape().x());
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
}
// Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
- if(!are_weights_reshaped)
+ if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
{
- if(transpose_weights || _is_batched_fc_layer)
- {
- // Allocate the tensor for the weights reshaped
- _reshape_weights_output.allocator()->allocate();
- }
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
+ }
+
+ if(_linearize_input)
+ {
+ _im2col_output.allocator()->allocate();
+ }
+
+ if(_is_batched_fc_layer)
+ {
+ _interleave4x4_output.allocator()->allocate();
}
}
@@ -321,8 +255,10 @@
_reshape_weights_kernel.run();
}
- // Linearize input if comes from a convolutional layer
- if(_is_fc_after_conv)
+ _memory_group.acquire();
+
+ // Linearize input if it comes from a convolutional layer
+ if(_linearize_input)
{
NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
}
@@ -341,4 +277,7 @@
{
NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
}
+
+ _memory_group.release();
}
+} // namespace arm_compute
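The rewritten configure() derives its behaviour from the shape contract in the comment above (Input: In x B, Weights: flat(In) x Out, Output: Out x B): the batch dimensions are read off the output, the leading input dimensions are flattened, im2col is only inserted when the input is not already linear, and the 1xW transpose is only needed in the batched case. A small sketch of that bookkeeping on concrete, illustrative shapes:

#include "arm_compute/core/TensorShape.h"

#include <algorithm>
#include <cstddef>
#include <iostream>

using namespace arm_compute;

int main()
{
    // Illustrative case 3): a 7x7x64 convolution output with a batch of 8,
    // fully connected to 4096 outputs per batch element.
    const TensorShape input_shape(7U, 7U, 64U, 8U); // In x B
    const TensorShape output_shape(4096U, 8U);      // Out x B

    const int    num_batch_dimensions = std::max(0, static_cast<int>(output_shape.num_dimensions()) - 1);
    const int    num_input_dimensions = static_cast<int>(input_shape.num_dimensions()) - num_batch_dimensions;
    const size_t linear_input_size    = input_shape.total_size_lower(num_input_dimensions);

    const bool linearize_input = input_shape.x() != linear_input_size; // true: 7 != 7 * 7 * 64
    const bool is_batched      = num_batch_dimensions > 0;             // true: batch of 8

    std::cout << "flattened input size: " << linear_input_size // 3136
              << ", linearize: " << linearize_input
              << ", batched: " << is_batched << std::endl;
    return 0;
}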
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 15d5f4e..ff92ef8 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,30 +26,41 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
#include <cmath>
-using namespace arm_compute;
-
-NEGEMM::NEGEMM()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+namespace arm_compute
+{
+NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+ _run_vector_matrix_multiplication(false), _run_addition(false)
{
}
void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16, DataType::QS8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
if(c != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
@@ -57,100 +68,135 @@
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
}
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
- if((a->info()->dimension(1) == 1))
+ // Check if the first input tensor is a vector.
+ // If so, all the kernels for reshaping the tensors can be skipped
+ if(_run_vector_matrix_multiplication)
{
- _run_vector_matrix_multiplication = true;
-
// Configure the matrix multiply kernel
_mm_kernel.configure(a, b, d, alpha);
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
+ }
}
else
{
- _run_vector_matrix_multiplication = false;
-
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- TensorShape shape_tmp_b = b->info()->tensor_shape();
-
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
-
- switch(a->info()->data_type())
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
{
- case DataType::F32:
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+ }
+#elif defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+ }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+#if defined(__arm__) || defined(__aarch64__)
+ if(_mm_optimised_kernel != nullptr)
+ {
+ struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+ const int M = d->info()->tensor_shape().y();
+ const int N = d->info()->tensor_shape().x();
+ const int K = a->info()->tensor_shape().x();
+
+#if defined(__arm__)
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+ constexpr size_t alignment = 4096;
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
+
+ // Configure matrix multiplication kernel
+ _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
+
+ _workspace.allocator()->allocate();
+ }
+ else
+#endif /* defined(__arm__) || defined(__aarch64__) */
+ {
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+
+ const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
+ shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
+
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
+ // Configure interleave kernel
+ _interleave_kernel.configure(a, &_tmp_a);
+
+ // Configure transpose kernel
+ _transpose_kernel.configure(b, &_tmp_b);
+
+ // Configure matrix multiplication kernel
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
+
+ // Allocate once all the configure methods have been called
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
{
- shape_tmp_b.set(0, b->info()->dimension(1) * 4);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
- break;
- }
- case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
- {
- shape_tmp_b.set(0, b->info()->dimension(1) * 8);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
- break;
- }
-#endif
- case DataType::QS8:
- {
- shape_tmp_b.set(0, b->info()->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.0f));
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR_ON("Data type not supported");
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
}
}
-
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
-
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
-
- // Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a);
-
- // Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b);
-
- // Configure matrix multiplication kernel
- _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
-
- // Allocate once the all configure methods have been called
- _tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
- }
-
- // Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
- {
- _ma_kernel.configure(c, d, beta);
- _run_addition = true;
}
}
void NEGEMM::run()
{
- if(!_run_vector_matrix_multiplication)
- {
- // Run interleave kernel
- NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+ _memory_group.acquire();
- // Run transpose kernel
- NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ if(_mm_optimised_kernel != nullptr)
+ {
+ NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ _memory_group.release();
}
-
- // Run matrix multiply kernel
- NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
-
- // Run matrix addition kernel
- if(_run_addition)
+ else
{
- NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ if(!_run_vector_matrix_multiplication)
+ {
+ // Run interleave kernel
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+
+ // Run transpose kernel
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
+
+ NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
+
+ _memory_group.release();
+
+ // Run matrix addition kernel
+ if(_run_addition)
+ {
+ NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ }
}
}
+} // namespace arm_compute
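NEGEMM now dispatches to a hand-written assembly kernel (NEGEMMAArch32Kernel or NEGEMMAArch64Kernel) when the CPU target, the F32 data type and a zero beta or absent C matrix allow it, backing it with a per-thread workspace tensor, and otherwise falls back to the interleave, transpose and multiply path. The public interface is unchanged apart from the optional memory manager; a usage sketch with illustrative matrix sizes:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, d;

    // D = alpha * A * B with A: K x M, B: N x K, D: N x M (shapes are x-major,
    // i.e. TensorShape(width, height)); the sizes are illustrative.
    constexpr unsigned int M = 64, N = 32, K = 128;
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32));
    d.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32));

    NEGEMM gemm; // optionally pass a shared memory manager to the constructor
    gemm.configure(&a, &b, nullptr /* no C matrix */, &d, 1.0f /* alpha */, 0.0f /* beta */);

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();

    // Fill a and b, then:
    gemm.run();
    return 0;
}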
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
index 4c77c88..63f330b 100644
--- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -23,14 +23,14 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
index b64f769..7413b28 100644
--- a/src/runtime/NEON/functions/NEGEMMLowp.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp
@@ -34,8 +34,8 @@
using namespace arm_compute;
-NEGEMMLowp::NEGEMMLowp()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+NEGEMMLowp::NEGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
{
}
@@ -63,6 +63,10 @@
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
_interleave_kernel.configure(a, &_tmp_a);
_transpose_kernel.configure(b, &_tmp_b);
_mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
@@ -73,6 +77,8 @@
void NEGEMMLowp::run()
{
+ _memory_group.acquire();
+
/* Run interleave kernel */
NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
@@ -81,4 +87,6 @@
/* Run matrix multiply kernel */
NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
index dc40ece..571bf2b 100644
--- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -24,17 +24,17 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp
index 95ba5cb..db8eb63 100644
--- a/src/runtime/NEON/functions/NEGaussian3x3.cpp
+++ b/src/runtime/NEON/functions/NEGaussian3x3.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEGaussian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEGaussian3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEGaussian3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index 5ccc765..b010ca0 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -32,17 +32,20 @@
using namespace arm_compute;
-NEGaussian5x5::NEGaussian5x5()
- : _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
+NEGaussian5x5::NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
{
}
void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
// Init temporary buffer
- TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16);
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16);
_tmp.allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
// Create and configure kernels for the two passes
_kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
_kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
@@ -54,7 +57,12 @@
void NEGaussian5x5::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+
+ _memory_group.release();
}
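Functions that gained a std::shared_ptr<IMemoryManager> constructor argument (NEGaussian5x5 here, and NEGEMM, NEFullyConnectedLayer and others above) default it to nullptr, so existing call sites keep compiling, while passing one shared manager to several functions lets their intermediates share memory pools. A hedged sketch of sharing a manager, assuming the memory-manager classes introduced alongside this change (BlobLifetimeManager, PoolManager, MemoryManagerOnDemand); the exact set-up and finalisation steps should be taken from the library's own memory-manager examples:

#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"

#include <memory>

using namespace arm_compute;

int main()
{
    // Assumed construction pattern: a lifetime manager and a pool manager
    // drive an on-demand memory manager that several functions share.
    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto memory_mgr   = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    NEGaussian5x5 gauss(memory_mgr);
    NEGEMM        gemm(memory_mgr);

    // ... configure both functions and allocate their inputs/outputs, then
    // finalise the memory manager (see the library's memory-manager examples)
    // before calling gauss.run() and gemm.run() ...
    return 0;
}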
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index e1d64f1..84ea0ca 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
@@ -36,6 +35,7 @@
#include "arm_compute/runtime/Pyramid.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -46,8 +46,10 @@
{
}
-NEGaussianPyramidHalf::NEGaussianPyramidHalf()
- : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+NEGaussianPyramidHalf::NEGaussianPyramidHalf() // NOLINT
+ : _border_handler(),
+ _horizontal_reduction(),
+ _vertical_reduction()
{
}
@@ -68,9 +70,9 @@
if(num_levels > 1)
{
- _border_handler = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
+ _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -107,14 +109,15 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- _border_handler[i].run(_border_handler[i].window());
+ NEScheduler::get().schedule(_border_handler.get() + i, Window::DimZ);
NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
}
}
-NEGaussianPyramidOrb::NEGaussianPyramidOrb()
- : _offsets(), _gaus5x5(), _scale_nearest()
+NEGaussianPyramidOrb::NEGaussianPyramidOrb() // NOLINT
+ : _gaus5x5(),
+ _scale_nearest()
{
}
@@ -135,30 +138,19 @@
if(num_levels > 1)
{
- _gaus5x5 = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
- _scale_nearest = arm_compute::cpp14::make_unique<NEScaleKernel[]>(num_levels - 1);
- _offsets = arm_compute::cpp14::make_unique<Image[]>(num_levels - 1);
+ _gaus5x5 = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
+ _scale_nearest = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
_tmp.init(pyramid_info);
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- const size_t width = _pyramid->get_pyramid_level(i + 1)->info()->dimension(0);
- const size_t height = _pyramid->get_pyramid_level(i + 1)->info()->dimension(1);
-
- /* Allocate Image for the offsets used by NEAREST interpolation */
- TensorInfo tensor_info(TensorShape(width, height), Format::S32);
- _offsets[i].allocator()->init(tensor_info);
-
/* Configure gaussian 5x5 */
_gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
- /* Configure scale image kernel */
- _scale_nearest[i].configure(_tmp.get_pyramid_level(i), nullptr, nullptr, _offsets.get() + i, _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR,
- border_mode == BorderMode::UNDEFINED);
-
- _offsets[i].allocator()->allocate();
+ /* Configure scale */
+ _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
}
_tmp.allocate();
@@ -178,6 +170,6 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
_gaus5x5[i].run();
- NEScheduler::get().schedule(_scale_nearest.get() + i, Window::DimY);
+ _scale_nearest[i].run();
}
}
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index a592f53..5e98269 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -31,8 +31,8 @@
using namespace arm_compute;
-NEHOGDescriptor::NEHOGDescriptor()
- : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+NEHOGDescriptor::NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
{
}
@@ -71,9 +71,16 @@
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space.allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+ // Manage intermediate buffers
+ _memory_group.manage(&_hog_space);
+
// Initialise orientation binning kernel
_orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
@@ -88,6 +95,8 @@
void NEHOGDescriptor::run()
{
+ _memory_group.acquire();
+
// Run gradient
_gradient.run();
@@ -96,4 +105,6 @@
// Run block normalization kernel
NEScheduler::get().schedule(&_block_norm, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
index e8ed29d..49d0778 100644
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -23,14 +23,14 @@
*/
#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
{
- auto k = arm_compute::cpp14::make_unique<NEHOGDetectorKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHOGDetectorKernel>();
k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
_kernel = std::move(k);
}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index 2f4b880..efc8690 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -23,15 +23,19 @@
*/
#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-NEHOGGradient::NEHOGGradient()
- : _derivative(), _mag_phase(nullptr), _gx(), _gy()
+NEHOGGradient::NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _derivative(),
+ _mag_phase(nullptr),
+ _gx(),
+ _gy()
{
}
@@ -48,19 +52,23 @@
_gx.allocator()->init(info);
_gy.allocator()->init(info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Initialise derivative kernel
_derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
// Initialise magnitude/phase kernel
if(PhaseType::UNSIGNED == phase_type)
{
- auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
k->configure(&_gx, &_gy, output_magnitude, output_phase);
_mag_phase = std::move(k);
}
else
{
- auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
k->configure(&_gx, &_gy, output_magnitude, output_phase);
_mag_phase = std::move(k);
}
@@ -72,9 +80,13 @@
void NEHOGGradient::run()
{
+ _memory_group.acquire();
+
// Run derivative
_derivative.run();
// Run magnitude/phase kernel
NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 173b8f4..8c834e2 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -24,16 +24,30 @@
#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-NEHOGMultiDetection::NEHOGMultiDetection()
- : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
- _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+NEHOGMultiDetection::NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _gradient_kernel(),
+ _orient_bin_kernel(),
+ _block_norm_kernel(),
+ _hog_detect_kernel(),
+ _non_maxima_kernel(),
+ _hog_space(),
+ _hog_norm_space(),
+ _detection_windows(),
+ _mag(),
+ _phase(),
+ _non_maxima_suppression(false),
+ _num_orient_bin_kernel(0),
+ _num_block_norm_kernel(0),
+ _num_hog_detect_kernel(0)
{
}
@@ -112,12 +126,12 @@
_num_block_norm_kernel = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute
_num_hog_detect_kernel = input_hog_detect.size(); // Number of NEHOGDetector functions to compute
- _orient_bin_kernel = arm_compute::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
- _block_norm_kernel = arm_compute::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
- _hog_detect_kernel = arm_compute::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
- _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
- _hog_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
- _hog_norm_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
+ _orient_bin_kernel = arm_compute::support::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::support::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::support::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
// Allocate tensors for magnitude and phase
TensorInfo info_mag(shape_img, Format::S16);
@@ -126,6 +140,10 @@
TensorInfo info_phase(shape_img, Format::U8);
_phase.allocator()->init(info_phase);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
@@ -151,10 +169,17 @@
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space[i].allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_space.get() + i);
+
// Initialise orientation binning kernel
_orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
// Configure NETensor for the normalized HOG space and block normalization kernel
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
@@ -165,10 +190,19 @@
TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
_hog_norm_space[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_norm_space.get() + i);
+
// Initialize block normalization kernel
_block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
// Configure HOG detector kernel
for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
{
@@ -181,14 +215,6 @@
_non_maxima_kernel->configure(_detection_windows, min_distance);
// Allocate intermediate tensors
- _mag.allocator()->allocate();
- _phase.allocator()->allocate();
-
- for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
- {
- _hog_space[i].allocator()->allocate();
- }
-
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
_hog_norm_space[i].allocator()->allocate();
@@ -199,6 +225,8 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Reset detection window
_detection_windows->clear();
@@ -226,6 +254,8 @@
// Run non-maxima suppression kernel if enabled
if(_non_maxima_suppression)
{
- _non_maxima_kernel->run(_non_maxima_kernel->window());
+ NEScheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
}
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index b54fb67..25e28d2 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
#include "arm_compute/core/TensorInfo.h"
@@ -35,14 +34,28 @@
#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
#include <utility>
using namespace arm_compute;
-NEHarrisCorners::NEHarrisCorners()
- : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0)
+NEHarrisCorners::NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _sobel(),
+ _harris_score(),
+ _non_max_suppr(),
+ _candidates(),
+ _sort_euclidean(),
+ _border_gx(),
+ _border_gy(),
+ _gx(),
+ _gy(),
+ _score(),
+ _nonmax(),
+ _corners_list(),
+ _num_corner_candidates(0)
{
}
@@ -69,32 +82,36 @@
_gx.allocator()->init(tensor_info_gxgy);
_gy.allocator()->init(tensor_info_gxgy);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
TensorInfo tensor_info_score(shape, Format::F32);
_score.allocator()->init(tensor_info_score);
_nonmax.allocator()->init(tensor_info_score);
- _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+ _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
// Set/init Sobel kernel accordingly with gradient_size
switch(gradient_size)
{
case 3:
{
- auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel3x3>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 5:
{
- auto k = arm_compute::cpp14::make_unique<NESobel5x5>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel5x5>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 7:
{
- auto k = arm_compute::cpp14::make_unique<NESobel7x7>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel7x7>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
@@ -106,27 +123,30 @@
// Normalization factor
const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
+ // Manage intermediate buffers
+ _memory_group.manage(&_score);
+
if(use_fp16)
{
switch(block_size)
{
case 3:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
break;
case 5:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
break;
case 7:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
@@ -141,21 +161,21 @@
{
case 3:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<3>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
break;
case 5:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<5>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
break;
case 7:
{
- auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<7>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
_harris_score = std::move(k);
}
@@ -168,26 +188,35 @@
_border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
_border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
+ // Allocate once all the configure methods have been called
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_nonmax);
+
// Init non-maxima suppression function
_non_max_suppr.configure(&_score, &_nonmax, border_mode);
+ // Allocate once all the configure methods have been called
+ _score.allocator()->allocate();
+
// Init corner candidates kernel
_candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+ // Allocate once all the configure methods have been called
+ _nonmax.allocator()->allocate();
+
// Init euclidean distance
_sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
-
- // Allocate once all the configure methods have been called
- _gx.allocator()->allocate();
- _gy.allocator()->allocate();
- _score.allocator()->allocate();
- _nonmax.allocator()->allocate();
}
void NEHarrisCorners::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Init to 0 number of corner candidates
_num_corner_candidates = 0;
@@ -195,8 +224,8 @@
_sobel->run();
// Fill border before harris score kernel
- _border_gx.run(_border_gx.window());
- _border_gy.run(_border_gy.window());
+ NEScheduler::get().schedule(&_border_gx, Window::DimZ);
+ NEScheduler::get().schedule(&_border_gy, Window::DimZ);
// Run harris score kernel
NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
@@ -208,5 +237,7 @@
NEScheduler::get().schedule(&_candidates, Window::DimY);
// Run sort & euclidean distance
- _sort_euclidean.run(_sort_euclidean.window());
+ NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
+
+ _memory_group.release();
}
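
The NEHarrisCorners changes above spell out the ordering contract behind the new MemoryGroup: each intermediate tensor is passed to manage() before the kernel that produces it is configured, its allocate() call is deferred until the last consumer has been configured, and at run time the group is acquired before the first managed kernel and released at the end. A minimal sketch of that pattern with a hypothetical single-intermediate function (MyFunction, _producer and _consumer are illustrative placeholders, not part of this patch):

    // Sketch only: assumes a class with members Tensor _tmp, MemoryGroup _memory_group,
    // and two kernels _producer/_consumer wired as in the functions above.
    void MyFunction::configure(const ITensor *input, ITensor *output)
    {
        _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), Format::F32));

        _memory_group.manage(&_tmp);        // lifetime starts: before configuring the writer
        _producer.configure(input, &_tmp);  // kernel writing the intermediate
        _consumer.configure(&_tmp, output); // kernel reading the intermediate
        _tmp.allocator()->allocate();       // lifetime ends: after configuring the last reader
    }

    void MyFunction::run()
    {
        _memory_group.acquire(); // obtain backing memory for all managed tensors
        NEScheduler::get().schedule(&_producer, Window::DimY);
        NEScheduler::get().schedule(&_consumer, Window::DimY);
        _memory_group.release(); // hand the memory back to the manager
    }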
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index c42b2a5..f333ecb 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -24,17 +24,17 @@
#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IDistribution1D.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
NEHistogram::NEHistogram()
- : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
+ : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::support::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
{
}
@@ -45,7 +45,7 @@
// Allocate space for threads local histograms
_local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
- _local_hist = arm_compute::cpp14::make_unique<uint32_t[]>(_local_hist_size);
+ _local_hist = arm_compute::support::cpp14::make_unique<uint32_t[]>(_local_hist_size);
// Configure kernel
_histogram_kernel.configure(input, output, _local_hist.get(), _window_lut.get());
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index af604e9..2e94ed5 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEIntegralImage::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEIntegralImageKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
k->configure(input, output);
_kernel = std::move(k);
_border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, 0);
diff --git a/src/runtime/NEON/functions/NEL2Normalize.cpp b/src/runtime/NEON/functions/NEL2Normalize.cpp
new file mode 100644
index 0000000..349a781
--- /dev/null
+++ b/src/runtime/NEON/functions/NEL2Normalize.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEL2Normalize::NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
+{
+}
+
+void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+{
+ // Manage intermediate buffers
+ _memory_group.manage(&_sumsq);
+
+ // Configure Kernels
+ _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+ _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+
+ // Allocate intermediate tensors
+ _sumsq.allocator()->allocate();
+}
+
+void NEL2Normalize::run()
+{
+ _memory_group.acquire();
+
+ _reduce_func.run();
+ NEScheduler::get().schedule(&_normalize_kernel, Window::DimY);
+
+ _memory_group.release();
+}
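
NEL2Normalize composes the SUM_SQUARE reduction (axis 0 is the only axis that reduction supports, see NEReductionOperation later in this patch) with the normalization kernel, keeping the sum-of-squares tensor inside the memory group. A hedged usage sketch built only from the configure() signature above; the shape and epsilon are arbitrary example values, and passing an explicit nullptr keeps the intermediate self-allocated when no memory manager is wired in:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void l2_normalize_example()
    {
        Tensor src;
        Tensor dst;
        src.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));

        NEL2Normalize l2_norm(nullptr); // no shared memory manager in this sketch
        l2_norm.configure(&src, &dst, 0 /* axis */, 1e-12f /* epsilon */);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src ...
        l2_norm.run(); // reduction, then row-wise normalization
    }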
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 8232c79..a680f1f 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IPyramid.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -33,11 +32,18 @@
#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-NELaplacianPyramid::NELaplacianPyramid()
- : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _gauss_pyr(), _conv_pyr(), _depth_function()
+NELaplacianPyramid::NELaplacianPyramid() // NOLINT
+ : _num_levels(0),
+ _gaussian_pyr_function(),
+ _convf(),
+ _subf(),
+ _gauss_pyr(),
+ _conv_pyr(),
+ _depth_function()
{
}
@@ -86,8 +92,8 @@
// Create Gaussian Pyramid function
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
- _convf = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
- _subf = arm_compute::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
+ _convf = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
+ _subf = arm_compute::support::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
for(unsigned int i = 0; i < _num_levels; ++i)
{
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index 36ac4a7..0893701 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -24,18 +24,21 @@
#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IPyramid.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
using namespace arm_compute;
-NELaplacianReconstruct::NELaplacianReconstruct()
- : _tmp_pyr(), _addf(), _scalef(), _depthf()
+NELaplacianReconstruct::NELaplacianReconstruct() // NOLINT
+ : _tmp_pyr(),
+ _addf(),
+ _scalef(),
+ _depthf()
{
}
@@ -61,8 +64,8 @@
_tmp_pyr.init(pyramid_info);
// Allocate add and scale functions. Level 0 does not need to be scaled.
- _addf = arm_compute::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
- _scalef = arm_compute::cpp14::make_unique<NEScale[]>(num_levels - 1);
+ _addf = arm_compute::support::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
+ _scalef = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
const size_t last_level = num_levels - 1;
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 85d7ba3..cb48598 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
using namespace arm_compute;
-NELocallyConnectedLayer::NELocallyConnectedLayer()
- : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+ _is_first_run(false)
{
}
@@ -65,11 +66,14 @@
std::tie(stride_x, stride_y) = conv_info.stride();
std::tie(pad_x, pad_y) = conv_info.pad();
+ const unsigned int kernel_width = weights->info()->dimension(0);
+ const unsigned int kernel_height = weights->info()->dimension(1);
+
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
- stride_x, stride_y, pad_x, pad_y, conv_info.round());
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+ conv_info);
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
@@ -99,8 +103,12 @@
shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+ // Manage intermediate buffers
+ _memory_group.manage(&_input_im2col_reshaped);
+ _memory_group.manage(&_gemm_output);
+
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -120,6 +128,8 @@
NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
}
+ _memory_group.acquire();
+
// Run input reshaping
NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
@@ -128,4 +138,6 @@
// Reshape output matrix
NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+
+ _memory_group.release();
}
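
The output-size computation now takes both kernel dimensions plus the full PadStrideInfo instead of assuming a square kernel and unpacking the strides and pads by hand. With the usual floor rounding each returned dimension is (in + 2 * pad - kernel) / stride + 1; a quick illustrative check of the new call, with example values that are not taken from the patch:

    // Illustrative only: 224x224 input, 3x3 kernel, stride 1, pad 1
    // (PadStrideInfo constructor order assumed: stride_x, stride_y, pad_x, pad_y).
    const unsigned int  in_w = 224, in_h = 224;
    const unsigned int  kernel_w = 3, kernel_h = 3;
    const PadStrideInfo conv_info(1, 1, 1, 1);

    unsigned int conv_w = 0;
    unsigned int conv_h = 0;
    std::tie(conv_w, conv_h) = scaled_dimensions(in_w, in_h, kernel_w, kernel_h, conv_info);
    // conv_w == (224 + 2 * 1 - 3) / 1 + 1 == 224, and conv_h likewise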
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
index 9390ca2..7877995 100644
--- a/src/runtime/NEON/functions/NEMagnitude.cpp
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -35,13 +35,13 @@
{
if(use_fp16)
{
- auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
k->configure(input1, input2, output, nullptr);
_kernel = std::move(k);
}
else
{
- auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
k->configure(input1, input2, output, nullptr);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
index 47143f5..2304bc8 100644
--- a/src/runtime/NEON/functions/NEMeanStdDev.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -23,19 +23,19 @@
*/
#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
using namespace arm_compute;
NEMeanStdDev::NEMeanStdDev()
- : _mean_stddev_kernel(), _global_sum(0), _global_sum_squared(0)
+ : _mean_stddev_kernel(), _fill_border_kernel(), _global_sum(0), _global_sum_squared(0)
{
}
-void NEMeanStdDev::configure(const IImage *input, float *mean, float *stddev)
+void NEMeanStdDev::configure(IImage *input, float *mean, float *stddev)
{
_mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+ _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
}
void NEMeanStdDev::run()
@@ -43,5 +43,6 @@
_global_sum = 0;
_global_sum_squared = 0;
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimZ);
NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp
index aa7cc97..627e5fb 100644
--- a/src/runtime/NEON/functions/NEMedian3x3.cpp
+++ b/src/runtime/NEON/functions/NEMedian3x3.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEMedian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEMedian3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMedian3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
index cab9200..54e89ab 100644
--- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -32,7 +32,7 @@
{
}
-void NEMinMaxLocation::configure(const IImage *input, int32_t *min, int32_t *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+void NEMinMaxLocation::configure(const IImage *input, void *min, void *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
{
_min_max.configure(input, min, max);
_min_max_loc.configure(input, min, max, min_loc, max_loc, min_count, max_count);
diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp
index 01aea3b..57bd4e7 100644
--- a/src/runtime/NEON/functions/NENonLinearFilter.cpp
+++ b/src/runtime/NEON/functions/NENonLinearFilter.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -35,7 +35,7 @@
BorderMode border_mode,
uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NENonLinearFilterKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NENonLinearFilterKernel>();
k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
index a7b3759..3b59820 100644
--- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
+++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,16 +32,16 @@
void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode)
{
- auto k = arm_compute::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
if(border_mode != BorderMode::UNDEFINED)
{
- _border_handler.configure(input, 1, BorderMode::CONSTANT, 0);
+ _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, 0);
}
else
{
- _border_handler.configure(input, 1, BorderMode::UNDEFINED, 0);
+ _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, 0);
}
}
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index 69ff325..e01ef66 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -32,8 +32,8 @@
using namespace arm_compute;
-NENormalizationLayer::NENormalizationLayer()
- : _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
+NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
{
}
@@ -44,6 +44,9 @@
TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
_input_squared.allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_input_squared);
+
// Configure kernels
_norm_kernel.configure(input, &_input_squared, output, norm_info);
_multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
@@ -55,7 +58,11 @@
void NENormalizationLayer::run()
{
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
NEScheduler::get().schedule(&_border_handler, Window::DimY);
NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index 49135e4..e90d8f6 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
#include "arm_compute/core/TensorInfo.h"
@@ -34,11 +33,21 @@
#include "arm_compute/runtime/Pyramid.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-NEOpticalFlow::NEOpticalFlow()
- : _func_scharr(), _kernel_tracker(), _scharr_gx(), _scharr_gy(), _new_points(nullptr), _new_points_estimates(nullptr), _old_points(nullptr), _new_points_internal(), _old_points_internal(),
+NEOpticalFlow::NEOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _func_scharr(),
+ _kernel_tracker(),
+ _scharr_gx(),
+ _scharr_gy(),
+ _new_points(nullptr),
+ _new_points_estimates(nullptr),
+ _old_points(nullptr),
+ _new_points_internal(),
+ _old_points_internal(),
_num_levels(0)
{
}
@@ -65,10 +74,10 @@
const float pyr_scale = old_pyramid->info()->scale();
- _func_scharr = arm_compute::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
- _kernel_tracker = arm_compute::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
- _scharr_gx = arm_compute::cpp14::make_unique<Tensor[]>(_num_levels);
- _scharr_gy = arm_compute::cpp14::make_unique<Tensor[]>(_num_levels);
+ _func_scharr = arm_compute::support::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
+ _kernel_tracker = arm_compute::support::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
+ _scharr_gx = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
+ _scharr_gy = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
_old_points_internal = LKInternalKeypointArray(old_points->num_values());
_new_points_internal = LKInternalKeypointArray(old_points->num_values());
@@ -89,6 +98,10 @@
_scharr_gx[i].allocator()->init(tensor_info);
_scharr_gy[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_scharr_gx.get() + i);
+ _memory_group.manage(_scharr_gy.get() + i);
+
// Init Scharr kernel
_func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
@@ -108,6 +121,8 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
+ _memory_group.acquire();
+
for(unsigned int level = _num_levels; level > 0; --level)
{
// Run Scharr kernel
@@ -116,4 +131,6 @@
// Run Lucas-Kanade kernel
NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
}
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
index 7683f46..436d22f 100644
--- a/src/runtime/NEON/functions/NEPhase.cpp
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPhase.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
k->configure(input1, input2, nullptr, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 056d33b..2e2ea11 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
- auto k = arm_compute::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 6f0cc4f..4c4e11f 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -23,15 +23,15 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
{
// Configure pooling kernel
- auto k = arm_compute::cpp14::make_unique<NEPoolingLayerKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEPoolingLayerKernel>();
k->configure(input, output, pool_info);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
new file mode 100644
index 0000000..a131c48
--- /dev/null
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEQuantizationLayer::NEQuantizationLayer()
+ : _quantize_kernel(), _min_max_kernel(), _min_max()
+{
+}
+
+void NEQuantizationLayer::configure(const ITensor *input, ITensor *output)
+{
+ // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel
+ _min_max_kernel.configure(input, &_min_max);
+
+ // Configure quantize kernel
+ _quantize_kernel.configure(input, output, &_min_max);
+
+ // Allocate min_max tensor
+ _min_max.allocator()->allocate();
+}
+
+void NEQuantizationLayer::run()
+{
+ // Reset min and max
+ _min_max_kernel.reset();
+
+ // Run min and max kernel
+ NEScheduler::get().schedule(&_min_max_kernel, Window::DimY);
+
+ // Run quantize kernel
+ NEScheduler::get().schedule(&_quantize_kernel, Window::DimY);
+}
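
NEQuantizationLayer is a two-pass function: the min/max kernel is reset and scans the whole input on every run(), and the quantize kernel then maps values into the output using that range. A hedged call-site sketch, following the same tensor setup as the NEL2Normalize example earlier; the U8 output format is an assumption about what the quantize kernel writes, since the header is not part of this hunk:

    Tensor src;
    Tensor dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::U8)); // assumed output type

    NEQuantizationLayer quant;
    quant.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...
    quant.run(); // pass 1: reset + min/max, pass 2: quantize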
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
new file mode 100644
index 0000000..1f1400c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEROIPoolingLayer::NEROIPoolingLayer()
+ : _roi_kernel()
+{
+}
+
+void NEROIPoolingLayer::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ _roi_kernel.configure(input, rois, output, pool_info);
+}
+
+void NEROIPoolingLayer::run()
+{
+ NEScheduler::get().schedule(&_roi_kernel, Window::DimX);
+}
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
new file mode 100644
index 0000000..45c3e5d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Define dimension to split the window
+ *
+ * @param[in] axis Reduction axis
+ *
+ * @return The dimension to split the window
+ */
+size_t reduction_window_split_dimension(unsigned int axis)
+{
+ switch(axis)
+ {
+ case 0:
+ return Window::DimY;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ }
+}
+BorderMode reduction_operation_border_mode(ReductionOperation op)
+{
+ switch(op)
+ {
+ case ReductionOperation::SUM_SQUARE:
+ return BorderMode::CONSTANT;
+ default:
+ return BorderMode::CONSTANT;
+ }
+}
+} // namespace
+
+NEReductionOperation::NEReductionOperation()
+ : _reduction_kernel(), _fill_border_kernel(), _window_split(0)
+{
+}
+
+void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+
+ // Configure reduction kernel
+ _reduction_kernel.configure(input, output, axis, op);
+ _window_split = reduction_window_split_dimension(axis);
+
+ // Configure fill border kernel
+ BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
+ BorderMode fill_border_mode = reduction_operation_border_mode(op);
+ _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(0));
+}
+
+void NEReductionOperation::run()
+{
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_reduction_kernel, _window_split);
+}
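
As written, only axis 0 is accepted (any other axis hits the ARM_COMPUTE_ERROR in reduction_window_split_dimension), the reduction window is split along DimY, and the border is filled with constant zeros, which is neutral for SUM_SQUARE. A hedged sketch of a stand-alone use, again following the NEL2Normalize example's setup; that the reduced dimension collapses to size 1 in the output shape is an assumption inferred from how NEL2Normalize consumes this function:

    Tensor src;
    Tensor sumsq;
    src.allocator()->init(TensorInfo(TensorShape(256U, 16U), 1, DataType::F32));
    sumsq.allocator()->init(TensorInfo(TensorShape(1U, 16U), 1, DataType::F32)); // axis 0 collapsed (assumed)

    NEReductionOperation reduction;
    reduction.configure(&src, &sumsq, 0 /* axis */, ReductionOperation::SUM_SQUARE);

    src.allocator()->allocate();
    sumsq.allocator()->allocate();
    // ... fill src ...
    reduction.run(); // border fill, then the row-split reduction kernel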
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
index 9f06fb6..882e93b 100644
--- a/src/runtime/NEON/functions/NERemap.cpp
+++ b/src/runtime/NEON/functions/NERemap.cpp
@@ -24,13 +24,13 @@
#include "arm_compute/runtime/NEON/functions/NERemap.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -44,7 +44,7 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
- auto k = arm_compute::cpp14::make_unique<NERemapKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NERemapKernel>();
k->configure(input, map_x, map_y, output, policy);
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
new file mode 100644
index 0000000..fef4e0c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEReshapeLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index b70f626..bbd3fac 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -27,11 +27,12 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
#include <cstddef>
@@ -85,12 +86,16 @@
}
} // namespace
-NEScale::NEScale()
- : _offsets(), _dx(), _dy()
+NEScale::NEScale() // NOLINT
+ : _offsets(),
+ _dx(),
+ _dy(),
+ _scale_kernel(),
+ _border_handler()
{
}
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
ARM_COMPUTE_ERROR_ON(nullptr == output);
@@ -116,8 +121,6 @@
policy = InterpolationPolicy::NEAREST_NEIGHBOR;
}
- auto k = arm_compute::cpp14::make_unique<NEScaleKernel>();
-
// Check if the border mode is UNDEFINED
const bool border_undefined = border_mode == BorderMode::UNDEFINED;
@@ -128,7 +131,7 @@
TensorInfo tensor_info_offsets(shape, Format::S32);
_offsets.allocator()->init(tensor_info_offsets);
- k->configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
+ _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -146,7 +149,7 @@
_dx.allocator()->init(tensor_info_dxdy);
_dy.allocator()->init(tensor_info_dxdy);
- k->configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
+ _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -159,13 +162,18 @@
}
case InterpolationPolicy::AREA:
{
- k->configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
+ _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
break;
}
default:
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
- _kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void NEScale::run()
+{
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+ NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
}
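
NEScale is no longer a simple one-kernel function: it owns its scale kernel and border handler, schedules the border fill itself before the scale kernel, and the constant border value is now a PixelValue rather than a raw uint8_t. A hedged call-site sketch of the new signature; formats and sizes are illustrative:

    Tensor src;
    Tensor dst;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
    dst.allocator()->init(TensorInfo(TensorShape(320U, 240U), Format::U8));

    NEScale scale;
    scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...
    scale.run(); // border fill (DimZ), then the scale kernel (DimY)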
diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp
index 04b3f14..ba9985e 100644
--- a/src/runtime/NEON/functions/NEScharr3x3.cpp
+++ b/src/runtime/NEON/functions/NEScharr3x3.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NEScharr3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NEScharr3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEScharr3x3Kernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp
index 3b46fd7..753b1f6 100644
--- a/src/runtime/NEON/functions/NESobel3x3.cpp
+++ b/src/runtime/NEON/functions/NESobel3x3.cpp
@@ -23,9 +23,9 @@
*/
#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void NESobel3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<NESobel3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NESobel3x3Kernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index 8967a22..d8f4eda 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -32,8 +32,8 @@
using namespace arm_compute;
-NESobel5x5::NESobel5x5()
- : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+NESobel5x5::NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
{
}
@@ -50,6 +50,8 @@
{
_tmp_x.allocator()->init(tensor_info);
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -58,6 +60,7 @@
else if(run_sobel_x)
{
_tmp_x.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
_sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -65,6 +68,7 @@
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
@@ -75,7 +79,12 @@
void NESobel5x5::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index f628da9..5b6f60b 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -32,8 +32,8 @@
using namespace arm_compute;
-NESobel7x7::NESobel7x7()
- : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+NESobel7x7::NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
{
}
@@ -50,6 +50,8 @@
{
_tmp_x.allocator()->init(tensor_info);
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -58,6 +60,7 @@
else if(run_sobel_x)
{
_tmp_x.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
_sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -65,6 +68,7 @@
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
@@ -75,7 +79,12 @@
void NESobel7x7::run()
{
- _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+
+ _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 0651eab..cc5d4e9 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -31,15 +31,14 @@
using namespace arm_compute;
-NESoftmaxLayer::NESoftmaxLayer()
- : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
+NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
{
}
void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
// Create intermediate tensors shapes
TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
@@ -51,11 +50,16 @@
_max.allocator()->init(tensor_info_max_sum);
_sum.allocator()->init(tensor_info_max_sum);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+ _memory_group.manage(&_max);
+ _memory_group.manage(&_sum);
+
// Configure Kernels
_max_kernel.configure(input, &_max);
_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
_norm_kernel.configure(&_tmp, &_sum, output);
- _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX));
+ _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
// Allocate intermediate tensors
_tmp.allocator()->allocate();
@@ -65,8 +69,12 @@
void NESoftmaxLayer::run()
{
+ _memory_group.acquire();
+
NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
NEScheduler::get().schedule(&_max_kernel, Window::DimY);
NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY);
NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+
+ _memory_group.release();
}
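
All of the functions above now thread an optional std::shared_ptr<IMemoryManager> into their MemoryGroup. The reason it is a constructor argument is that several functions can share one manager, so intermediates whose lifetimes never overlap (each group only holds memory between acquire() and release() inside run()) can be backed by the same pool. A sketch of that wiring; how the concrete IMemoryManager is built is outside this patch, so create_memory_manager() is an assumed helper:

    // Assumed helper that returns some concrete IMemoryManager implementation (not shown in this patch).
    std::shared_ptr<IMemoryManager> mm = create_memory_manager();

    // Both functions register their intermediates with the same manager, so the softmax
    // temporaries and the Sobel temporaries can reuse the same backing memory.
    NESoftmaxLayer softmax(mm);
    NESobel5x5    sobel(mm);

    // ... configure and allocate the I/O tensors, then:
    softmax.run(); // acquire -> kernels -> release
    sobel.run();   // acquire -> kernels -> release, potentially on the same pool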
diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp
index ebb8a0a..cae117a 100644
--- a/src/runtime/NEON/functions/NETableLookup.cpp
+++ b/src/runtime/NEON/functions/NETableLookup.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NETableLookup.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NETableLookup::configure(const ITensor *input, const ILut *lut, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NETableLookupKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NETableLookupKernel>();
k->configure(input, lut, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp
index 93dc124..37883e5 100644
--- a/src/runtime/NEON/functions/NEThreshold.cpp
+++ b/src/runtime/NEON/functions/NEThreshold.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEThreshold.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NEThreshold::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
{
- auto k = arm_compute::cpp14::make_unique<NEThresholdKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEThresholdKernel>();
k->configure(input, output, threshold, false_value, true_value, type, upper);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 53ac9c5..eb81e02 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void NETranspose::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::cpp14::make_unique<NETransposeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp
index 24fb16f..889d827 100644
--- a/src/runtime/NEON/functions/NEWarpAffine.cpp
+++ b/src/runtime/NEON/functions/NEWarpAffine.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -41,14 +42,14 @@
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
- auto k = arm_compute::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
k->configure(input, output, matrix, border_mode, constant_border_value);
_kernel = std::move(k);
break;
}
case InterpolationPolicy::BILINEAR:
{
- auto k = arm_compute::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::BILINEAR>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::BILINEAR>>();
k->configure(input, output, matrix, border_mode, constant_border_value);
_kernel = std::move(k);
break;
diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp
index 84b2df5..ed5d6a0 100644
--- a/src/runtime/NEON/functions/NEWarpPerspective.cpp
+++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -41,14 +42,14 @@
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
- auto k = arm_compute::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
k->configure(input, output, matrix, border_mode, constant_border_value);
_kernel = std::move(k);
break;
}
case InterpolationPolicy::BILINEAR:
{
- auto k = arm_compute::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>>();
+ auto k = arm_compute::support::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>>();
k->configure(input, output, matrix, border_mode, constant_border_value);
_kernel = std::move(k);
break;
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index 0cced73..be81641 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -38,7 +38,7 @@
return scheduler;
}
-OMPScheduler::OMPScheduler()
+OMPScheduler::OMPScheduler() // NOLINT
: _num_threads(omp_get_max_threads())
{
}
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
new file mode 100644
index 0000000..42cc943
--- /dev/null
+++ b/src/runtime/PoolManager.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/PoolManager.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IMemoryPool.h"
+#include "support/ToolchainSupport.h"
+
+#include <list>
+
+using namespace arm_compute;
+
+PoolManager::PoolManager()
+ : _free_pools(), _occupied_pools(), _sem(), _mtx()
+{
+}
+
+IMemoryPool *PoolManager::lock_pool()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!");
+
+ _sem->wait();
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+ ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty(), "Empty pool must exist as semaphore has been signalled");
+ _occupied_pools.splice(std::begin(_occupied_pools), _free_pools, std::begin(_free_pools));
+ return _occupied_pools.front().get();
+}
+
+void PoolManager::unlock_pool(IMemoryPool *pool)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!");
+
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+ auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr<IMemoryPool> &pool_it)
+ {
+ return pool_it.get() == pool;
+ });
+ ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!");
+ _free_pools.splice(std::begin(_free_pools), _occupied_pools, it);
+ _sem->signal();
+}
+
+void PoolManager::register_pool(std::unique_ptr<IMemoryPool> pool)
+{
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+ ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to register a new one!");
+
+ // Set pool
+ _free_pools.push_front(std::move(pool));
+
+ // Update semaphore
+ _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size());
+}
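
The new PoolManager file implements a blocking pool hand-out: a counting semaphore gates lock_pool() so callers sleep until a pool is free, a mutex protects the two std::lists, and std::list::splice moves a pool between the free and occupied lists without copying or reallocating. The sketch below reproduces that pattern in self-contained form; "Pool" is a stand-in for IMemoryPool and the semaphore is built from std::condition_variable, since arm_compute::Semaphore is a library type not shown here.

// Self-contained sketch of the PoolManager pattern above.
#include <algorithm>
#include <condition_variable>
#include <list>
#include <memory>
#include <mutex>

struct Pool { /* stand-in for IMemoryPool */ };

class SimplePoolManager
{
public:
    void register_pool(std::unique_ptr<Pool> pool)
    {
        std::lock_guard<std::mutex> lock(_mtx);
        _free.push_front(std::move(pool));
        ++_count; // one more pool available to hand out
    }

    Pool *lock_pool()
    {
        std::unique_lock<std::mutex> lock(_mtx);
        _cv.wait(lock, [this] { return _count > 0; }); // block until a pool is free
        --_count;
        _occupied.splice(std::begin(_occupied), _free, std::begin(_free));
        return _occupied.front().get();
    }

    void unlock_pool(Pool *pool)
    {
        std::lock_guard<std::mutex> lock(_mtx);
        auto it = std::find_if(std::begin(_occupied), std::end(_occupied),
                               [pool](const std::unique_ptr<Pool> &p) { return p.get() == pool; });
        if(it != std::end(_occupied))
        {
            _free.splice(std::begin(_free), _occupied, it);
            ++_count;
            _cv.notify_one(); // wake one waiter in lock_pool()
        }
    }

private:
    std::list<std::unique_ptr<Pool>> _free{};
    std::list<std::unique_ptr<Pool>> _occupied{};
    std::mutex                       _mtx{};
    std::condition_variable          _cv{};
    int                              _count{0};
};

In the real code, register_pool() asserts that no pool is currently occupied, so registration is expected to happen up front, before any worker starts calling lock_pool()/unlock_pool().
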
diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp
index f1b6c93..ebd6570 100644
--- a/src/runtime/Pyramid.cpp
+++ b/src/runtime/Pyramid.cpp
@@ -24,10 +24,10 @@
#include "arm_compute/runtime/Pyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PyramidInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
@@ -46,7 +46,7 @@
void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding)
{
_info = info;
- _pyramid = arm_compute::cpp14::make_unique<Tensor[]>(_info.num_levels());
+ _pyramid = arm_compute::support::cpp14::make_unique<Tensor[]>(_info.num_levels());
size_t w = _info.width();
size_t h = _info.height();
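
The Pyramid hunk allocates one Tensor per level via the array overload of make_unique and then sizes each level starting from the pyramid's base width and height. As an illustration only, the sketch below shows the usual per-level sizing for a scaled pyramid (each level shrinks the previous one by a scale factor, clamped to at least 1 pixel); the 0.5 factor and the rounding here are assumptions for the example, not the library's exact arithmetic.

// Illustrative sketch of sizing pyramid levels from a base resolution and a scale factor.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

int main()
{
    std::size_t       w          = 640;
    std::size_t       h          = 480;
    const float       scale      = 0.5f; // e.g. a half-scale pyramid
    const std::size_t num_levels = 4;

    for(std::size_t level = 0; level < num_levels; ++level)
    {
        std::printf("level %zu: %zux%zu\n", level, w, h);
        w = std::max<std::size_t>(1, static_cast<std::size_t>(std::ceil(w * scale)));
        h = std::max<std::size_t>(1, static_cast<std::size_t>(std::ceil(h * scale)));
    }
    return 0;
}
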
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index a131928..505c4a3 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -26,13 +26,13 @@
#include "arm_compute/core/Error.h"
#if ARM_COMPUTE_CPP_SCHEDULER
#include "arm_compute/runtime/CPP/CPPScheduler.h"
-#endif
+#endif /* ARM_COMPUTE_CPP_SCHEDULER */
#include "arm_compute/runtime/SingleThreadScheduler.h"
#if ARM_COMPUTE_OPENMP_SCHEDULER
#include "arm_compute/runtime/OMP/OMPScheduler.h"
-#endif
+#endif /* ARM_COMPUTE_OPENMP_SCHEDULER */
using namespace arm_compute;
@@ -42,9 +42,9 @@
Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
#elif ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
-#else
+#else /* ARM_COMPUTE_*_SCHEDULER */
Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
-#endif
+#endif /* ARM_COMPUTE_*_SCHEDULER */
void Scheduler::set(Type t)
{
@@ -64,17 +64,17 @@
{
#if ARM_COMPUTE_CPP_SCHEDULER
return true;
-#else
+#else /* ARM_COMPUTE_CPP_SCHEDULER */
return false;
-#endif
+#endif /* ARM_COMPUTE_CPP_SCHEDULER */
}
case Type::OMP:
{
#if ARM_COMPUTE_OPENMP_SCHEDULER
return true;
-#else
+#else /* ARM_COMPUTE_OPENMP_SCHEDULER */
return false;
-#endif
+#endif /* ARM_COMPUTE_OPENMP_SCHEDULER */
}
case Type::CUSTOM:
{
@@ -105,18 +105,18 @@
{
#if ARM_COMPUTE_CPP_SCHEDULER
return CPPScheduler::get();
-#else
+#else /* ARM_COMPUTE_CPP_SCHEDULER */
ARM_COMPUTE_ERROR("Recompile with cppthreads=1 to use C++11 scheduler.");
-#endif
+#endif /* ARM_COMPUTE_CPP_SCHEDULER */
break;
}
case Type::OMP:
{
#if ARM_COMPUTE_OPENMP_SCHEDULER
return OMPScheduler::get();
-#else
+#else /* ARM_COMPUTE_OPENMP_SCHEDULER */
ARM_COMPUTE_ERROR("Recompile with openmp=1 to use openmp scheduler.");
-#endif
+#endif /* ARM_COMPUTE_OPENMP_SCHEDULER */
break;
}
case Type::CUSTOM:
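
The Scheduler.cpp hunks only annotate the #else/#endif directives with the macro they close, but they expose the underlying pattern: build-time flags (cppthreads=1 / openmp=1) decide which backends are compiled in, the default scheduler type is chosen by an #if/#elif chain, and the runtime queries simply report what the build enabled. The sketch below shows that pattern with illustrative stand-in macros and types, not the library's own.

// Sketch of compile-time backend selection with commented #endif guards.
#include <stdexcept>

#define SKETCH_HAS_CPP_SCHEDULER 1
#define SKETCH_HAS_OMP_SCHEDULER 0

enum class SchedulerType { ST, CPP, OMP };

#if SKETCH_HAS_CPP_SCHEDULER
static SchedulerType g_default_type = SchedulerType::CPP;
#elif SKETCH_HAS_OMP_SCHEDULER
static SchedulerType g_default_type = SchedulerType::OMP;
#else  /* no multi-threaded scheduler compiled in */
static SchedulerType g_default_type = SchedulerType::ST;
#endif /* SKETCH_HAS_*_SCHEDULER */

bool is_available(SchedulerType t)
{
    switch(t)
    {
        case SchedulerType::ST:
            return true;
        case SchedulerType::CPP:
#if SKETCH_HAS_CPP_SCHEDULER
            return true;
#else  /* SKETCH_HAS_CPP_SCHEDULER */
            return false;
#endif /* SKETCH_HAS_CPP_SCHEDULER */
        case SchedulerType::OMP:
#if SKETCH_HAS_OMP_SCHEDULER
            return true;
#else  /* SKETCH_HAS_OMP_SCHEDULER */
            return false;
#endif /* SKETCH_HAS_OMP_SCHEDULER */
        default:
            throw std::runtime_error("Unknown scheduler type");
    }
}
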
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
index 435068c..a76c37e 100644
--- a/src/runtime/Tensor.cpp
+++ b/src/runtime/Tensor.cpp
@@ -26,7 +26,7 @@
using namespace arm_compute;
Tensor::Tensor()
- : _allocator()
+ : _allocator(this)
{
}
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 5c719c7..272b9f5 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/MemoryGroup.h"
#include <cstddef>
@@ -63,11 +64,50 @@
}
} // namespace
-TensorAllocator::TensorAllocator()
- : _buffer(nullptr)
+TensorAllocator::TensorAllocator(Tensor *owner)
+ : _associated_memory_group(nullptr), _buffer(nullptr), _owner(owner)
{
}
+TensorAllocator::~TensorAllocator()
+{
+ if((_associated_memory_group == nullptr) && (_buffer != nullptr))
+ {
+ delete[] _buffer;
+ _buffer = nullptr;
+ info().set_is_resizable(true);
+ }
+}
+
+TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept
+ : ITensorAllocator(std::move(o)),
+ _associated_memory_group(o._associated_memory_group),
+ _buffer(o._buffer),
+ _owner(o._owner)
+{
+ o._associated_memory_group = nullptr;
+ o._buffer = nullptr;
+ o._owner = nullptr;
+}
+
+TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept
+{
+ if(&o != this)
+ {
+ _associated_memory_group = o._associated_memory_group;
+ o._associated_memory_group = nullptr;
+
+ _buffer = o._buffer;
+ o._buffer = nullptr;
+
+ _owner = o._owner;
+ o._owner = nullptr;
+
+ ITensorAllocator::operator=(std::move(o));
+ }
+ return *this;
+}
+
void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates &coords, TensorInfo sub_info)
{
// Get parent info
@@ -90,28 +130,44 @@
uint8_t *TensorAllocator::data() const
{
- return (_buffer != nullptr) ? _buffer.get()->data() : nullptr;
+ return _buffer;
}
void TensorAllocator::allocate()
{
ARM_COMPUTE_ERROR_ON(_buffer != nullptr);
-
- _buffer = std::make_shared<std::vector<uint8_t>>(info().total_size());
+ if(_associated_memory_group == nullptr)
+ {
+ _buffer = new uint8_t[info().total_size()]();
+ }
+ else
+ {
+ _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer), info().total_size());
+ }
info().set_is_resizable(false);
}
void TensorAllocator::free()
{
- ARM_COMPUTE_ERROR_ON(_buffer == nullptr);
+ if((_associated_memory_group == nullptr) && (_buffer != nullptr))
+ {
+ delete[] _buffer;
+ _buffer = nullptr;
+ info().set_is_resizable(true);
+ }
+}
- _buffer.reset();
- info().set_is_resizable(true);
+void TensorAllocator::set_associated_memory_group(MemoryGroup *associated_memory_group)
+{
+ ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
+ ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
+ ARM_COMPUTE_ERROR_ON(_buffer != nullptr);
+ _associated_memory_group = associated_memory_group;
}
uint8_t *TensorAllocator::lock()
{
- return (_buffer != nullptr) ? _buffer.get()->data() : nullptr;
+ return _buffer;
}
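
The rewritten TensorAllocator replaces the shared_ptr<std::vector<uint8_t>> buffer with a raw uint8_t* that is either owned directly (new[]/delete[]) or provided later by an associated MemoryGroup via finalize_memory(), and it adds move operations that transfer the pointer and null out the source; the Tensor constructor change earlier in the diff supplies the back-pointer (`this`) that the memory group needs. The sketch below condenses that ownership scheme into a self-contained class with stand-in names; the memory-group side is reduced to a flag, and unlike the diff's move assignment, this sketch frees any existing buffer before overwriting it, which is a deliberate choice for the example rather than the library's behaviour.

// Self-contained sketch of the raw-buffer ownership and move semantics above.
#include <cstddef>
#include <cstdint>
#include <utility>

class SketchAllocator
{
public:
    explicit SketchAllocator(void *owner = nullptr)
        : _externally_managed(false), _buffer(nullptr), _owner(owner)
    {
    }

    ~SketchAllocator()
    {
        free();
    }

    // Move steals the buffer and flags; the moved-from object no longer frees anything.
    SketchAllocator(SketchAllocator &&o) noexcept
        : _externally_managed(o._externally_managed), _buffer(o._buffer), _owner(o._owner)
    {
        o._externally_managed = false;
        o._buffer             = nullptr;
        o._owner              = nullptr;
    }

    SketchAllocator &operator=(SketchAllocator &&o) noexcept
    {
        if(&o != this)
        {
            free(); // release any buffer we currently own before taking o's
            _externally_managed = std::exchange(o._externally_managed, false);
            _buffer             = std::exchange(o._buffer, nullptr);
            _owner              = std::exchange(o._owner, nullptr);
        }
        return *this;
    }

    void allocate(std::size_t bytes)
    {
        if(!_externally_managed)
        {
            _buffer = new uint8_t[bytes](); // value-initialised, owned by this allocator
        }
        // else: an external memory manager would set _buffer later,
        // mirroring finalize_memory() in the diff above.
    }

    void free()
    {
        if(!_externally_managed && _buffer != nullptr)
        {
            delete[] _buffer; // only delete what we allocated ourselves
            _buffer = nullptr;
        }
    }

    uint8_t *data() const
    {
        return _buffer;
    }

private:
    bool     _externally_managed;
    uint8_t *_buffer;
    void    *_owner; // back-pointer to the owning tensor (cf. Tensor passing `this`)
};
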
void TensorAllocator::unlock()