arm_compute v17.06
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index 3c32e2b..b75ebcf 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -23,12 +23,12 @@
*/
#include "arm_compute/core/AccessWindowAutoPadding.h"
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Window.h"
using namespace arm_compute;
-AccessWindowAutoPadding::AccessWindowAutoPadding(TensorInfo *info)
+AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info)
: _info(info)
{
}
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index d3eb666..8b6419c 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -24,12 +24,12 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Window.h"
using namespace arm_compute;
-AccessWindowStatic::AccessWindowStatic(TensorInfo *info, int start_x, int start_y, int end_x, int end_y)
+AccessWindowStatic::AccessWindowStatic(ITensorInfo *info, int start_x, int start_y, int end_x, int end_y)
: _info(info), _start_x(start_x), _start_y(start_y), _end_x(end_x), _end_y(end_y)
{
}
@@ -39,7 +39,7 @@
ARM_COMPUTE_UNUSED(border_undefined);
ARM_COMPUTE_UNUSED(border_size);
- return compute_valid_region(window, std::move(input_valid_region));
+ return compute_valid_region(window, input_valid_region);
}
ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const
@@ -55,12 +55,18 @@
// Start of the valid region is equal to the start of the static access but
// never outside of the tensor.
anchor.set(0, std::max<int>(0, _start_x));
- anchor.set(1, std::max<int>(0, _start_y));
+ if(_info->num_dimensions() > 1)
+ {
+ anchor.set(1, std::max<int>(0, _start_y));
+ }
// End of the valid region is equal to the end of the static access but
// never outside of the tensor.
shape.set(0, std::min<int>(_end_x, _info->tensor_shape()[0]));
- shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
+ if(_info->num_dimensions() > 1)
+ {
+ shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
+ }
// For higher dimension use the intersection of the window size and the
// valid region of the input
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index 45d4062..b3605c4 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/core/AccessWindowTranspose.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Window.h"
using namespace arm_compute;
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 76821c6..21b72dd 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -22,10 +22,48 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/CLHelpers.h"
-
+#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
+#include <map>
+#include <vector>
+
+namespace
+{
+arm_compute::GPUTarget get_bifrost_target(const std::string &name)
+{
+ arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
+
+ if(name == "G7")
+ {
+ target = arm_compute::GPUTarget::G70;
+ }
+
+ return target;
+}
+
+arm_compute::GPUTarget get_midgard_target(const std::string &name)
+{
+ arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
+
+ if(name == "T6")
+ {
+ target = arm_compute::GPUTarget::T600;
+ }
+ else if(name == "T7")
+ {
+ target = arm_compute::GPUTarget::T700;
+ }
+ else if(name == "T8")
+ {
+ target = arm_compute::GPUTarget::T800;
+ }
+
+ return target;
+}
+} // namespace
+
namespace arm_compute
{
std::string get_cl_type_from_data_type(const DataType &dt)
@@ -57,4 +95,71 @@
return "";
}
}
+
+const std::string &string_from_target(GPUTarget target)
+{
+ static std::map<GPUTarget, const std::string> gpu_target_map =
+ {
+ { GPUTarget::MIDGARD, "midgard" },
+ { GPUTarget::BIFROST, "bifrost" },
+ { GPUTarget::T600, "t600" },
+ { GPUTarget::T700, "t700" },
+ { GPUTarget::T800, "t800" },
+ { GPUTarget::G70, "g70" }
+ };
+
+ return gpu_target_map[target];
+}
+
+GPUTarget get_target_from_device(cl::Device &device)
+{
+ const std::string name_mali("Mali-");
+ GPUTarget target{ GPUTarget::MIDGARD };
+
+ size_t name_size = 0;
+ std::vector<char> name;
+
+ // Query device name size
+ cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, 0, nullptr, &name_size);
+ ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (name_size == 0), "clGetDeviceInfo failed to return valid information");
+ // Resize vector
+ name.resize(name_size);
+ // Query device name
+ err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name.data(), nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+
+ std::string name_str(name.begin(), name.end());
+ auto pos = name_str.find(name_mali);
+
+ if(pos != std::string::npos)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG((pos + name_mali.size() + 2) > name_str.size(), "Device name is shorter than expected.");
+ std::string sub_name = name_str.substr(pos + name_mali.size(), 2);
+
+ if(sub_name[0] == 'G')
+ {
+ target = get_bifrost_target(sub_name);
+ }
+ else if(sub_name[0] == 'T')
+ {
+ target = get_midgard_target(sub_name);
+ }
+ else
+ {
+ ARM_COMPUTE_INFO("Mali GPU unknown. Target is set to the default one.");
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_INFO("Can't find valid Mali GPU. Target is set to the default one.");
+ }
+
+ return target;
+}
+
+GPUTarget get_arch_from_target(GPUTarget target)
+{
+ return (target & GPUTarget::GPU_ARCH_MASK);
+}
} // namespace arm_compute
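
The helpers added above map the device name reported by OpenCL (e.g. "Mali-T760", "Mali-G71") to a GPUTarget and reduce it to its architecture. A minimal usage sketch, assuming a working OpenCL runtime; the cl.hpp device-discovery code and header choices are assumptions, not part of this patch:

// Hypothetical usage of the new GPU-target helpers (illustrative, not library code).
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/OpenCL.h"

#include <iostream>
#include <vector>

int main()
{
    // Pick the first GPU device of the first platform; error handling omitted.
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);

    std::vector<cl::Device> devices;
    platforms.at(0).getDevices(CL_DEVICE_TYPE_GPU, &devices);
    cl::Device device = devices.at(0);

    const arm_compute::GPUTarget target = arm_compute::get_target_from_device(device);
    const arm_compute::GPUTarget arch   = arm_compute::get_arch_from_target(target);

    // Prints e.g. "t700 / midgard" for a Mali-T7xx or "g70 / bifrost" for a Mali-G71
    std::cout << arm_compute::string_from_target(target) << " / "
              << arm_compute::string_from_target(arch) << std::endl;

    return 0;
}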
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index bc12aad..15a5d90 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -124,6 +124,7 @@
{ "channel_extract_YUYV422", "channel_extract.cl" },
{ "combine_gradients_L1", "canny.cl" },
{ "combine_gradients_L2", "canny.cl" },
+ { "concatenate_depth", "concatenate.cl" },
{ "convolution_rectangle", "convolution_rectangle.cl" },
{ "col2im", "convolution_layer.cl" },
{ "convolution3x3_static", "convolution3x3.cl" },
@@ -159,9 +160,11 @@
{ "gemm_ma_f32", "gemm.cl" },
{ "gemm_mm_u8", "gemm.cl" },
{ "gemm_mm_f16", "gemm.cl" },
- { "gemm_mm_f32", "gemm.cl" },
+ { "gemm_mm_f32_midgard", "gemm.cl" },
+ { "gemm_mm_f32_bifrost", "gemm.cl" },
{ "gemm_vm_f16", "gemm.cl" },
{ "gemm_vm_f32", "gemm.cl" },
+ { "gemm_lc_vm_f32", "gemm.cl" },
{ "gemm_transpose1x16_u8", "gemm.cl" },
{ "gemm_transpose1x8_f16", "gemm.cl" },
{ "gemm_transpose1x4_f32", "gemm.cl" },
@@ -172,6 +175,9 @@
{ "hist_border_kernel_fixed", "histogram.cl" },
{ "hist_local_kernel", "histogram.cl" },
{ "hist_local_kernel_fixed", "histogram.cl" },
+ { "hog_block_normalization", "hog.cl" },
+ { "hog_detector", "hog.cl" },
+ { "hog_orientation_binning", "hog.cl" },
{ "hysteresis", "canny.cl" },
{ "im2col_generic", "convolution_layer.cl" },
{ "im2col_reduced", "convolution_layer.cl" },
@@ -199,7 +205,8 @@
{ "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
{ "non_max_suppression", "nonmax.cl" },
{ "normalization_layer_cross_map", "normalization_layer.cl" },
- { "normalization_layer_in_map", "normalization_layer.cl" },
+ { "normalization_layer_in_map_1D", "normalization_layer.cl" },
+ { "batchnormalization_layer", "batchnormalization_layer.cl" },
{ "NV12_to_IYUV_bt709", "color_convert.cl" },
{ "NV12_to_RGB888_bt709", "color_convert.cl" },
{ "NV12_to_RGBA8888_bt709", "color_convert.cl" },
@@ -290,6 +297,10 @@
#include "./cl_kernels/channel_extract.clembed"
},
{
+ "concatenate.cl",
+#include "./cl_kernels/concatenate.clembed"
+ },
+ {
"color_convert.cl",
#include "./cl_kernels/color_convert.clembed"
},
@@ -362,6 +373,10 @@
#include "./cl_kernels/histogram.clembed"
},
{
+ "hog.cl",
+#include "./cl_kernels/hog.clembed"
+ },
+ {
"integral_image.cl",
#include "./cl_kernels/integral_image.clembed"
},
@@ -398,6 +413,10 @@
#include "./cl_kernels/normalization_layer.clembed"
},
{
+ "batchnormalization_layer.cl",
+#include "./cl_kernels/batchnormalization_layer.clembed"
+ },
+ {
"optical_flow_pyramid_lk.cl",
#include "./cl_kernels/optical_flow_pyramid_lk.clembed"
},
diff --git a/src/core/CL/ICLHOG.cpp b/src/core/CL/ICLHOG.cpp
new file mode 100644
index 0000000..e182997
--- /dev/null
+++ b/src/core/CL/ICLHOG.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLHOG.h"
+
+using namespace arm_compute;
+
+ICLHOG::ICLHOG()
+ : _mapping(nullptr)
+{
+}
+
+void ICLHOG::map(cl::CommandQueue &q, bool blocking)
+{
+ _mapping = do_map(q, blocking);
+}
+
+void ICLHOG::unmap(cl::CommandQueue &q)
+{
+ do_unmap(q);
+ _mapping = nullptr;
+}
+
+float *ICLHOG::descriptor() const
+{
+ return reinterpret_cast<float *>(_mapping);
+}
\ No newline at end of file
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index db6212f..7ac0fe3 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -30,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
#include <cstddef>
@@ -59,7 +61,7 @@
}
ICLKernel::ICLKernel()
- : _kernel(nullptr), _lws_hint(cl::Range_128_1)
+ : _kernel(nullptr), _lws_hint(cl::Range_128_1), _target(CLScheduler::get().target())
{
}
@@ -78,10 +80,9 @@
void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
- ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(window, tensor->info()->num_dimensions());
- const TensorInfo *info = tensor->info();
- const Strides &strides = info->strides_in_bytes();
+ const ITensorInfo *info = tensor->info();
+ const Strides &strides = info->strides_in_bytes();
// Calculate offset to the start of the window
unsigned int offset_first_element = info->offset_first_element_in_bytes();
@@ -136,3 +137,18 @@
{
return num_arguments_per_tensor<3>();
}
+
+void ICLKernel::set_target(cl::Device &device)
+{
+ _target = get_target_from_device(device);
+}
+
+void ICLKernel::set_target(GPUTarget target)
+{
+ _target = target;
+}
+
+GPUTarget ICLKernel::get_target() const
+{
+ return _target;
+}
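
With the target recorded on every ICLKernel, a kernel can specialize itself at configure time. An illustrative sketch of how a configure routine might pick between the two gemm_mm_f32 variants registered in CLKernelLibrary above; the helper function and its wiring are assumptions, not the library's actual CLGEMM code:

// Illustrative only: select a target-specific kernel name from the recorded GPUTarget.
#include "arm_compute/core/CL/CLHelpers.h"

#include <string>

std::string select_gemm_mm_f32_kernel(arm_compute::GPUTarget target)
{
    // get_arch_from_target() reduces e.g. G70 to BIFROST and T600/T700/T800 to MIDGARD
    switch(arm_compute::get_arch_from_target(target))
    {
        case arm_compute::GPUTarget::BIFROST:
            return "gemm_mm_f32_bifrost";
        default:
            return "gemm_mm_f32_midgard";
    }
}

A kernel deriving from ICLKernel would typically call set_target(CLScheduler::get().target()) or set_target(device) first, then build the selected program.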
diff --git a/src/core/CL/ICLMultiHOG.cpp b/src/core/CL/ICLMultiHOG.cpp
new file mode 100644
index 0000000..8ece566
--- /dev/null
+++ b/src/core/CL/ICLMultiHOG.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+
+#include "arm_compute/core/IHOG.h"
+
+using namespace arm_compute;
+
+IHOG *ICLMultiHOG::model(size_t index)
+{
+ return cl_model(index);
+}
+
+const IHOG *ICLMultiHOG::model(size_t index) const
+{
+ return cl_model(index);
+}
\ No newline at end of file
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 80fcfb5..3b8dfd2 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -51,6 +51,8 @@
using clCreateBuffer_func = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
using clCreateProgramWithSource_func = cl_program (*)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
using clReleaseKernel_func = cl_int (*)(cl_kernel kernel);
+using clGetDeviceInfo_func = cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *);
+using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
class CLSymbols
{
@@ -88,6 +90,8 @@
clRetainCommandQueue = reinterpret_cast<clRetainCommandQueue_func>(dlsym(handle, "clRetainCommandQueue"));
clEnqueueUnmapMemObject = reinterpret_cast<clEnqueueUnmapMemObject_func>(dlsym(handle, "clEnqueueUnmapMemObject"));
clReleaseMemObject = reinterpret_cast<clReleaseMemObject_func>(dlsym(handle, "clReleaseMemObject"));
+ clGetDeviceInfo = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
+ clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
dlclose(handle);
}
}
@@ -123,6 +127,8 @@
clRetainCommandQueue_func clRetainCommandQueue = nullptr;
clEnqueueUnmapMemObject_func clEnqueueUnmapMemObject = nullptr;
clReleaseMemObject_func clReleaseMemObject = nullptr;
+ clGetDeviceInfo_func clGetDeviceInfo = nullptr;
+ clGetDeviceIDs_func clGetDeviceIDs = nullptr;
};
bool arm_compute::opencl_is_available()
@@ -544,3 +550,37 @@
return CL_OUT_OF_RESOURCES;
}
}
+
+cl_int clGetDeviceIDs(cl_platform_id platform,
+ cl_device_type device_type,
+ cl_uint num_entries,
+ cl_device_id *devices,
+ cl_uint *num_devices)
+{
+ auto func = CLSymbols::get().clGetDeviceIDs;
+ if(func != nullptr)
+ {
+ return func(platform, device_type, num_entries, devices, num_devices);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clGetDeviceInfo(cl_device_id device,
+ cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ auto func = CLSymbols::get().clGetDeviceInfo;
+ if(func != nullptr)
+ {
+ return func(device, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
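
Every wrapper resolves its symbol at runtime with dlsym and, like the rest of this file, the two new entry points degrade to CL_OUT_OF_RESOURCES when libOpenCL.so cannot be loaded. A minimal sketch of guarding the CL path with opencl_is_available(); the assumption is that the declarations are visible through arm_compute/core/CL/OpenCL.h:

// Hedged sketch: probe for OpenCL before using the dynamically loaded entry points.
#include "arm_compute/core/CL/OpenCL.h"

#include <cstdio>

int main()
{
    if(!arm_compute::opencl_is_available())
    {
        std::printf("OpenCL runtime not found, skipping the CL path\n");
        return 0;
    }

    cl_uint num_devices = 0;
    // Passing a null platform is accepted by many implementations; a real application
    // would enumerate platforms first.
    const cl_int err = clGetDeviceIDs(nullptr, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices);
    std::printf("clGetDeviceIDs returned %d, %u GPU device(s)\n", err, static_cast<unsigned int>(num_devices));

    return 0;
}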
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index 23142ad..e3cbb6c 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -35,21 +35,25 @@
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
* @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
__kernel void activation_layer(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
{
// Get pixels pointer
- Image input = CONVERT_TO_IMAGE_STRUCT(input);
- Image output = CONVERT_TO_IMAGE_STRUCT(output);
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
// Load data
VEC_DATA_TYPE(DATA_TYPE, 16)
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
new file mode 100644
index 0000000..13e6702
--- /dev/null
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Apply batch normalization.
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: F32
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: F32
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: F32
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: F32
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+ VECTOR_DECLARATION(beta),
+ VECTOR_DECLARATION(gamma),
+ float epsilon)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+
+ float4 _in = 0;
+ float4 denominator = 0;
+ float4 numerator = 0;
+ float4 x_bar = 0;
+ float4 gamma_vec = 0;
+ float4 beta_vec = 0;
+
+ const int current_slice = get_global_id(2);
+
+ _in = vload4(0, (__global float *)in.ptr);
+ denominator = *((__global float *)(var.ptr + current_slice * var.stride_x));
+ denominator = rsqrt(denominator + epsilon);
+
+ // Calculate x bar and store results
+ numerator = *((__global float *)(mean.ptr + current_slice * mean.stride_x));
+ numerator = _in - numerator;
+ x_bar = numerator * denominator;
+
+ gamma_vec = *((__global float *)(gamma.ptr + current_slice * gamma.stride_x));
+ beta_vec = *((__global float *)(beta.ptr + current_slice * beta.stride_x));
+
+ vstore4(gamma_vec * x_bar + beta_vec, 0, (__global float *)out.ptr);
+}
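
The kernel above applies inference-time batch normalization slice by slice: x_bar = (x - mean[z]) * rsqrt(var[z] + epsilon), then out = gamma[z] * x_bar + beta[z], processing four floats per work-item. A scalar host-side reference of the same computation; the function name and the tightly packed layout are illustrative:

// Scalar reference of the per-element computation done by batchnormalization_layer.
#include <cmath>
#include <cstddef>
#include <vector>

void batchnorm_reference(const std::vector<float> &in, std::vector<float> &out,
                         const std::vector<float> &mean, const std::vector<float> &var,
                         const std::vector<float> &beta, const std::vector<float> &gamma,
                         float epsilon, std::size_t width, std::size_t height, std::size_t channels)
{
    for(std::size_t z = 0; z < channels; ++z) // current_slice in the kernel
    {
        const float denominator = 1.0f / std::sqrt(var[z] + epsilon); // rsqrt(var + epsilon)
        for(std::size_t i = 0; i < width * height; ++i)
        {
            const float x_bar = (in[z * width * height + i] - mean[z]) * denominator;
            out[z * width * height + i] = gamma[z] * x_bar + beta[z];
        }
    }
}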
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
new file mode 100644
index 0000000..00f5189
--- /dev/null
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] offset The offset to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate_depth(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ unsigned int offset)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ float4 source_values = vload4(0, (__global float *)src.ptr);
+
+ vstore4(source_values, 0, (__global float *)(dst.ptr + offset));
+}
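
concatenate_depth itself only copies four floats per work-item to dst.ptr + offset, so the byte offset for each input stacked along Z has to be supplied by the host. A hypothetical sketch of that bookkeeping, assuming matching X/Y shapes and no extra border padding (the host-side configure code is not part of this patch):

// Hypothetical host-side helper: byte offsets at which each input lands in the output.
#include <cstddef>
#include <vector>

std::vector<std::size_t> depth_concat_offsets(const std::vector<std::size_t> &input_depths,
                                              std::size_t dst_stride_z /* in bytes */)
{
    std::vector<std::size_t> offsets;
    std::size_t depth_written = 0;
    for(const std::size_t d : input_depths)
    {
        offsets.push_back(depth_written * dst_stride_z); // offset passed to concatenate_depth
        depth_written += d;
    }
    return offsets;
}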
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index 4554dd0..bd5dfaf 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -44,36 +44,53 @@
* @param[in] bias_ptr Pointer to the bias tensor. Same as input
* @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
* @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_stride_y Stride of the bias tensor in Y dimension (in bytes)
- * @param[in] bias_step_y bias_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] bias_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] width The width of the input tensor
* @param[in] height The height of the input tensor
+ * @param[in] depth The depth of the input tensor
+ * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
*/
__kernel void reshape_to_columns(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
#if defined HAS_BIAS
- IMAGE_DECLARATION(bias),
+ VECTOR_DECLARATION(bias),
#endif
- uint width, uint height)
+ uint width, uint height, uint depth, uint total_filters)
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
- __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y
- + get_global_id(1) * width * dst_stride_y + get_global_id(2) * width * height * dst_stride_y;
+ __global uchar *tmp_src_ptr = src.ptr;
+ __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(
+ 2) * width * height * dst_stride_y;
+#if defined HAS_BIAS
+ __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
+#endif
- *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);
+ if(is_last_thread)
+ {
+ for(uint i = 0; i < total_filters; ++i)
+ {
+ *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
#if defined HAS_BIAS
- // If it is the last thread in the 3 dimensional workgroup
- if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
- {
- tmp_out_ptr += dst_stride_y;
- *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)(bias_ptr + bias_offset_first_element_in_bytes));
- }
-
+ *((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
+ tmp_bias_ptr += bias_stride_x;
#endif
+ tmp_src_ptr += depth * src_stride_z;
+ tmp_dst_ptr += dst_stride_x;
+ }
+ }
+ else
+ {
+ for(uint i = 0; i < total_filters; ++i)
+ {
+ *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
+ tmp_src_ptr += depth * src_stride_z;
+ tmp_dst_ptr += dst_stride_x;
+ }
+ }
}
/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
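
reshape_to_columns now flattens every filter into one column of the destination matrix in a single pass over total_filters, with the last work-item appending the bias values as an extra row. A host-side reference of the equivalent index mapping; the tightly packed, row-major layout is an assumption for illustration:

// Reference of the index mapping implemented by reshape_to_columns above.
#include <cstddef>
#include <vector>

std::vector<float> reshape_to_columns_reference(const std::vector<float> &weights, // [filters][depth][height][width]
                                                const std::vector<float> &bias,    // [filters] (may be empty)
                                                std::size_t width, std::size_t height,
                                                std::size_t depth, std::size_t filters)
{
    const std::size_t rows = width * height * depth + (bias.empty() ? 0 : 1);
    std::vector<float> out(rows * filters, 0.0f);

    for(std::size_t f = 0; f < filters; ++f)
    {
        for(std::size_t z = 0; z < depth; ++z)
        {
            for(std::size_t y = 0; y < height; ++y)
            {
                for(std::size_t x = 0; x < width; ++x)
                {
                    const std::size_t row = x + y * width + z * width * height;
                    out[row * filters + f] = weights[((f * depth + z) * height + y) * width + x];
                }
            }
        }
        if(!bias.empty())
        {
            out[(rows - 1) * filters + f] = bias[f]; // bias appended as the last row
        }
    }
    return out;
}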
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index db849f5..caf6e3f 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -408,7 +408,8 @@
}
#endif
-/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+#if(defined WIDTH_MATRIX_B && defined ALPHA)
+/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
*
* @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
@@ -432,10 +433,9 @@
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-#if(defined WIDTH_MATRIX_B && defined ALPHA)
-__kernel void gemm_mm_f32(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
{
/* src_addr.s0 = address of matrix A */
/* src_addr.s1 = address of matrix B */
@@ -508,6 +508,216 @@
vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
}
+/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
+
+ // Compute end row address for matrix B
+ __global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;
+
+ // Reset accumulators
+ float c00 = 0.0f;
+ float c01 = 0.0f;
+ float c02 = 0.0f;
+ float c03 = 0.0f;
+ float c10 = 0.0f;
+ float c11 = 0.0f;
+ float c12 = 0.0f;
+ float c13 = 0.0f;
+ float c20 = 0.0f;
+ float c21 = 0.0f;
+ float c22 = 0.0f;
+ float c23 = 0.0f;
+ float c30 = 0.0f;
+ float c31 = 0.0f;
+ float c32 = 0.0f;
+ float c33 = 0.0f;
+
+ for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 4);
+ b0 = vload4(0, src_addr_b + 4);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 8);
+ b0 = vload4(0, src_addr_b + 8);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 12);
+ b0 = vload4(0, src_addr_b + 12);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+ }
+
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Multiply by the weight of matrix product
+ c00 = c00 * ALPHA;
+ c01 = c01 * ALPHA;
+ c02 = c02 * ALPHA;
+ c03 = c03 * ALPHA;
+ c10 = c10 * ALPHA;
+ c11 = c11 * ALPHA;
+ c12 = c12 * ALPHA;
+ c13 = c13 * ALPHA;
+ c20 = c20 * ALPHA;
+ c21 = c21 * ALPHA;
+ c22 = c22 * ALPHA;
+ c23 = c23 * ALPHA;
+ c30 = c30 * ALPHA;
+ c31 = c31 * ALPHA;
+ c32 = c32 * ALPHA;
+ c33 = c33 * ALPHA;
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ // Store 4x4 block
+ vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
+ vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
+ vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
+ vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
+}
+
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication
*
@@ -607,6 +817,7 @@
vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
}
+#if(defined WIDTH_VECTOR_A)
/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
*
* @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
@@ -632,7 +843,6 @@
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-#if(defined WIDTH_VECTOR_A)
__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
@@ -746,6 +956,7 @@
#endif /* (defined WIDTH_VECTOR_A) */
#endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
+#if(defined BETA)
/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
*
* @attention The beta's value need to be passed at compile time using -DBETA
@@ -763,7 +974,6 @@
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-#if(defined BETA)
__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
IMAGE_DECLARATION(dst))
{
@@ -819,3 +1029,71 @@
vstore8(out, 0, (__global half *)dst.ptr);
}
#endif /* (defined BETA) */
+
+#if(defined WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
+ *
+ * @attention The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
+ *
+ * @attention The input A and matrix B must not be reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
+ TENSOR3D_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 4;
+ int idy = get_global_id(1);
+
+ /* Compute the address for the vector A and matrix B */
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
+ src_addr.s1 += idx * sizeof(float);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+ float4 acc = 0.0f;
+
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ {
+ float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ acc += b0 * (float4)a0.s0;
+ acc += b1 * (float4)a0.s1;
+ }
+
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ {
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (float4)a0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
+#endif /* (defined WIDTH_VECTOR_A) */
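
gemm_mm_f32_bifrost accumulates a 4x4 block with scalar fma operations: each step consumes four floats of interleaved A (four rows at one k index) and four floats of transposed B (four columns at the same k), unrolled by a factor of four. A scalar model of that accumulation, assuming buffers already reshaped by gemm_interleave4x4_f32 and gemm_transpose1x4_f32:

// Scalar model of the 4x4 block accumulation performed by gemm_mm_f32_bifrost.
#include <cmath>
#include <cstddef>

void gemm_block4x4(const float *a_interleaved, const float *b_transposed,
                   std::size_t k, float alpha, float c[4][4])
{
    for(std::size_t i = 0; i < 4; ++i)
        for(std::size_t j = 0; j < 4; ++j)
            c[i][j] = 0.0f;

    // One iteration per k: outer product of 4 rows of A and 4 columns of B.
    for(std::size_t kk = 0; kk < k; ++kk, a_interleaved += 4, b_transposed += 4)
    {
        for(std::size_t i = 0; i < 4; ++i)
            for(std::size_t j = 0; j < 4; ++j)
                c[i][j] = std::fma(a_interleaved[i], b_transposed[j], c[i][j]);
    }

    for(std::size_t i = 0; i < 4; ++i)
        for(std::size_t j = 0; j < 4; ++j)
            c[i][j] *= alpha; // weight of the matrix product (-DALPHA in the kernel)
}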
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
new file mode 100644
index 0000000..31dd57b
--- /dev/null
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+#if(defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+
+/** This OpenCL kernel computes the HOG orientation binning
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DCELL_WIDTH = Width of the cell
+ * -# -DCELL_HEIGHT = Height of the cell
+ * -# -DNUM_BINS = Number of bins for each cell
+ * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG
+ *
+ * @note Each work-item computes a single cell
+ *
+ * @param[in] mag_ptr Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16
+ * @param[in] mag_stride_x Stride of the magnitude image in X dimension (in bytes)
+ * @param[in] mag_step_x mag_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mag_stride_y Stride of the magnitude image in Y dimension (in bytes)
+ * @param[in] mag_step_y mag_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mag_offset_first_element_in_bytes The offset of the first element in the magnitude image
+ * @param[in] phase_ptr Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8
+ * @param[in] phase_stride_x Stride of the phase image in X dimension (in bytes)
+ * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] phase_stride_y Stride of the phase image in Y dimension (in bytes)
+ * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the phase image
+ * @param[out] dst_ptr Pointer to the destination image which stores the local HOG for each cell. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
+ IMAGE_DECLARATION(phase),
+ IMAGE_DECLARATION(dst))
+{
+ float bins[NUM_BINS] = { 0 };
+
+ // Compute address for the magnitude and phase images
+ Image mag = CONVERT_TO_IMAGE_STRUCT(mag);
+ Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
+
+ __global uchar *mag_row_ptr = mag.ptr;
+ __global uchar *phase_row_ptr = phase.ptr;
+
+ for(int yc = 0; yc < CELL_HEIGHT; ++yc)
+ {
+ int xc = 0;
+ for(; xc <= (CELL_WIDTH - 4); xc += 4)
+ {
+ // Load magnitude and phase values
+ const float4 mag_f32 = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
+ float4 phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
+
+ // Scale phase: phase * scale + 0.5f
+ phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
+
+ // Compute histogram index.
+ int4 hidx_s32 = convert_int4(phase_f32);
+
+ // Compute magnitude weights (w0 and w1)
+ const float4 hidx_f32 = convert_float4(hidx_s32);
+
+ // w1 = phase_f32 - hidx_s32
+ const float4 w1_f32 = phase_f32 - hidx_f32;
+
+ // w0 = 1.0 - w1
+ const float4 w0_f32 = (float4)1.0f - w1_f32;
+
+ // Calculate the weights for splitting vote
+ const float4 mag_w0_f32 = mag_f32 * w0_f32;
+ const float4 mag_w1_f32 = mag_f32 * w1_f32;
+
+ // Weighted vote between 2 bins
+
+ // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+ hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+ // Bin 0
+ bins[hidx_s32.s0] += mag_w0_f32.s0;
+ bins[hidx_s32.s1] += mag_w0_f32.s1;
+ bins[hidx_s32.s2] += mag_w0_f32.s2;
+ bins[hidx_s32.s3] += mag_w0_f32.s3;
+
+ hidx_s32 += (int4)1;
+
+ // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+ hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+ // Bin1
+ bins[hidx_s32.s0] += mag_w1_f32.s0;
+ bins[hidx_s32.s1] += mag_w1_f32.s1;
+ bins[hidx_s32.s2] += mag_w1_f32.s2;
+ bins[hidx_s32.s3] += mag_w1_f32.s3;
+ }
+
+ // Left over computation
+ for(; xc < CELL_WIDTH; xc++)
+ {
+ const float mag_value = *((__global short *)mag_row_ptr + xc);
+ const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
+ const float w1 = phase_value - floor(phase_value);
+
+ // The quantised phase is the histogram index [0, NUM_BINS - 1]
+ // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
+ const uint hidx = (uint)(phase_value) % NUM_BINS;
+
+ // Weighted vote between 2 bins
+ bins[hidx] += mag_value * (1.0f - w1);
+ bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
+ }
+
+ // Point to the next row of magnitude and phase images
+ mag_row_ptr += mag_stride_y;
+ phase_row_ptr += phase_stride_y;
+ }
+
+ // Compute address for the destination image
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Store the local HOG in the global memory
+ int xc = 0;
+ for(; xc <= (NUM_BINS - 4); xc += 4)
+ {
+ float4 values = vload4(0, bins + xc);
+
+ vstore4(values, 0, ((__global float *)dst.ptr) + xc);
+ }
+
+ // Left over stores
+ for(; xc < NUM_BINS; ++xc)
+ {
+ ((__global float *)dst.ptr)[xc] = bins[xc];
+ }
+}
+#endif // (defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+
+#if(defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && defined HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+
+#ifndef L2_NORM
+#error The value of enum class HOGNormType::L2_NORM has not been passed to the OpenCL kernel
+#endif
+
+#ifndef L2HYS_NORM
+#error The value of enum class HOGNormType::L2HYS_NORM has not been passed to the OpenCL kernel
+#endif
+
+#ifndef L1_NORM
+#error The value of enum class HOGNormType::L1_NORM has not been passed to the OpenCL kernel
+#endif
+
+/** This OpenCL kernel computes the HOG block normalization
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block
+ * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
+ * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
+ * -# -DHOG_NORM_TYPE = Normalization type
+ * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
+ * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
+ * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
+ * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
+ *
+ * @note Each work-item computes a single block
+ *
+ * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image which stores the normalized HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ float sum = 0.0f;
+ float4 sum_f32 = (float4)(0.0f);
+
+ // Compute address for the source and destination tensor
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
+ {
+ const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
+
+ int xc = 0;
+ for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
+ {
+ const float4 val0 = vload4(0, hist_ptr + xc + 0);
+ const float4 val1 = vload4(0, hist_ptr + xc + 4);
+ const float4 val2 = vload4(0, hist_ptr + xc + 8);
+ const float4 val3 = vload4(0, hist_ptr + xc + 12);
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+ // Compute val^2 for L2_NORM or L2HYS_NORM
+ sum_f32 += val0 * val0;
+ sum_f32 += val1 * val1;
+ sum_f32 += val2 * val2;
+ sum_f32 += val3 * val3;
+#else
+ // Compute |val| for L1_NORM
+ sum_f32 += fabs(val0);
+ sum_f32 += fabs(val1);
+ sum_f32 += fabs(val2);
+ sum_f32 += fabs(val3);
+#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+
+ // Store the input values un-normalized, laid out linearly, in the output image. These values will be reused for the normalization.
+ // This layout keeps the next for loop, where the normalization is done, cache friendly because all the values
+ // will be accessed consecutively
+ vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
+ }
+
+ // Compute left over
+ for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
+ {
+ const float val = hist_ptr[xc];
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+ sum += val * val;
+#else
+ sum += fabs(val);
+#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+
+ ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
+ }
+ }
+
+ sum += dot(sum_f32, (float4)1.0f);
+
+ float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
+
+#if(HOG_NORM_TYPE == L2HYS_NORM)
+ // Reset sum
+ sum_f32 = (float4)0.0f;
+ sum = 0.0f;
+
+ int k = 0;
+ for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
+ {
+ float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
+ float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
+ float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
+ float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
+
+ // Scale val
+ val0 = val0 * (float4)scale;
+ val1 = val1 * (float4)scale;
+ val2 = val2 * (float4)scale;
+ val3 = val3 * (float4)scale;
+
+ // Clip val if over _threshold_l2hys
+ val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
+ val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
+ val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
+ val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
+
+ // Compute val^2
+ sum_f32 += val0 * val0;
+ sum_f32 += val1 * val1;
+ sum_f32 += val2 * val2;
+ sum_f32 += val3 * val3;
+
+ vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
+ }
+
+ // Compute left over
+ for(; k < NUM_BINS_PER_BLOCK; ++k)
+ {
+ float val = ((__global float *)dst.ptr)[k] * scale;
+
+ // Clip scaled input_value if over L2_HYST_THRESHOLD
+ val = fmin(val, (float)L2_HYST_THRESHOLD);
+
+ sum += val * val;
+
+ ((__global float *)dst.ptr)[k] = val;
+ }
+
+ sum += dot(sum_f32, (float4)1.0f);
+
+    // Use the same constant as OpenCV
+ scale = 1.0f / (sqrt(sum) + 1e-3f);
+
+#endif // (HOG_NORM_TYPE == L2HYS_NORM)
+
+ int i = 0;
+ for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
+ {
+ float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
+ float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
+ float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
+ float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
+
+ // Multiply val by the normalization scale factor
+ val0 = val0 * (float4)scale;
+ val1 = val1 * (float4)scale;
+ val2 = val2 * (float4)scale;
+ val3 = val3 * (float4)scale;
+
+ vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
+ }
+
+ for(; i < NUM_BINS_PER_BLOCK; ++i)
+ {
+ ((__global float *)dst.ptr)[i] *= scale;
+ }
+}
+#endif // (defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && defined HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+
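For reference, the L2HYS_NORM path above reduces to the following scalar computation. This is a host-side C++ sketch (a hypothetical helper, not part of the library) assuming the block histogram has already been gathered into a flat array:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Scalar equivalent of the L2-Hys branch of hog_block_normalization
    void l2hys_normalize(std::vector<float> &block, float l2_hyst_threshold)
    {
        const float num_bins = static_cast<float>(block.size());

        // First pass: L2 norm with the same regularization term as the kernel
        float sum = 0.0f;
        for(float v : block)
        {
            sum += v * v;
        }
        float scale = 1.0f / (std::sqrt(sum) + num_bins * 0.1f);

        // Scale, clip against the hysteresis threshold and accumulate the new norm
        sum = 0.0f;
        for(float &v : block)
        {
            v = std::min(v * scale, l2_hyst_threshold);
            sum += v * v;
        }

        // Renormalize with the same 1e-3f epsilon used by OpenCV
        scale = 1.0f / (std::sqrt(sum) + 1e-3f);
        for(float &v : block)
        {
            v *= scale;
        }
    }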
+#if(defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT)
+
+/** This OpenCL kernel computes the HOG detector using a linear SVM
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
+ * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
+ * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
+ * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array
+ * -# -DIDX_CLASS = Index of the class to detect
+ * -# -DBLOCK_STRIDE_WIDTH = Block stride for the X direction
+ * -# -DBLOCK_STRIDE_HEIGHT = Block stride for the Y direction
+ * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
+ * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
+ *
+ * @note Each work-item computes a single detection window
+ *
+ * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] hog_descriptor Pointer to HOG descriptor. Supported data types: F32
+ * @param[out] dst Pointer to DetectionWindow array
+ * @param[out] num_detection_windows Number of objects detected
+ */
+__kernel void hog_detector(IMAGE_DECLARATION(src),
+ __global float *hog_descriptor,
+ __global DetectionWindow *dst,
+ __global uint *num_detection_windows)
+{
+ // Check if the DetectionWindow array is full
+ if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
+ {
+ return;
+ }
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ const int src_step_y_f32 = src_stride_y / sizeof(float);
+
+ // Init score_f32 with 0
+ float4 score_f32 = (float4)0.0f;
+
+ // Init score with 0
+ float score = 0.0f;
+
+ __global float *src_row_ptr = (__global float *)src.ptr;
+
+ // Compute Linear SVM
+ for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
+ {
+ int xb = 0;
+
+ const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
+
+ for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
+ {
+ // Load descriptor values
+ float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
+ float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
+
+ float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
+ float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
+
+ // Multiply accumulate
+ score_f32 += a0_f32 * b0_f32;
+ score_f32 += a1_f32 * b1_f32;
+ }
+
+ for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
+ {
+ const float a = src_row_ptr[xb];
+ const float b = hog_descriptor[xb + offset_y];
+
+ score += a * b;
+ }
+ }
+
+ score += dot(score_f32, (float4)1.0f);
+
+ // Add the bias. The bias is located at the position (descriptor_size() - 1)
+ // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
+ score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
+
+ if(score > (float)THRESHOLD)
+ {
+ int id = atomic_inc(num_detection_windows);
+ if(id < MAX_NUM_DETECTION_WINDOWS)
+ {
+ dst[id].x = get_global_id(0) * BLOCK_STRIDE_WIDTH;
+ dst[id].y = get_global_id(1) * BLOCK_STRIDE_HEIGHT;
+ dst[id].width = DETECTION_WINDOW_WIDTH;
+ dst[id].height = DETECTION_WINDOW_HEIGHT;
+ dst[id].idx_class = IDX_CLASS;
+ dst[id].score = score;
+ }
+ }
+}
+#endif // defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && ...
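The scoring loop in hog_detector amounts to a dot product between the window's block histograms and the SVM weights, plus a trailing bias term. A minimal scalar sketch of the same computation (names are illustrative, not library API):

    #include <cstddef>

    // window_hog: first block of the candidate window inside the block-HOG image
    // (row_stride floats between block rows); descriptor: SVM weights, bias last.
    bool classify_window(const float *window_hog, std::size_t row_stride,
                         const float *descriptor,
                         std::size_t bins_per_descriptor_x,
                         std::size_t blocks_per_descriptor_y,
                         float threshold)
    {
        float score = 0.0f;
        for(std::size_t yb = 0; yb < blocks_per_descriptor_y; ++yb)
        {
            const float *row = window_hog + yb * row_stride;
            const float *w   = descriptor + yb * bins_per_descriptor_x;
            for(std::size_t xb = 0; xb < bins_per_descriptor_x; ++xb)
            {
                score += row[xb] * w[xb];
            }
        }
        // The bias lives at index bins_per_descriptor_x * blocks_per_descriptor_y
        score += descriptor[bins_per_descriptor_x * blocks_per_descriptor_y];
        return score > threshold;
    }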
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
index 015eeae..c4b0df8 100644
--- a/src/core/CL/cl_kernels/magnitude_phase.cl
+++ b/src/core/CL/cl_kernels/magnitude_phase.cl
@@ -57,10 +57,9 @@
*/
inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
{
- float16 arct = atan2pi(convert_float16(b), convert_float16(a));
- arct = select(arct, arct + 2, arct < 0.0f);
-
- return convert_uchar16(convert_int16(mad(arct, 90, 0.5f)) & 0xFFu);
+ float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f;
+ angle_deg_f32 = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f);
+ return convert_uchar16(angle_deg_f32);
}
/** Calculates signed phase between two inputs.
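The rewritten phase_unsigned() now works directly in degrees: atan2pi scaled by 180 yields an angle in (-180, 180], and negative angles are folded back by adding 180 so the result lands in [0, 180]. A scalar C++ sketch of the same mapping (hypothetical helper, not part of the library):

    #include <cmath>
    #include <cstdint>

    // Unsigned gradient orientation in degrees, folded into [0, 180]
    uint8_t phase_unsigned_deg(float gx, float gy)
    {
        const float pi    = 3.14159265358979f;
        float       angle = std::atan2(gy, gx) * 180.0f / pi; // (-180, 180]
        if(angle < 0.0f)
        {
            angle += 180.0f; // fold the negative half-plane, e.g. -45 becomes 135
        }
        return static_cast<uint8_t>(angle);
    }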
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index a913023..076b0d8 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -121,13 +121,13 @@
* @param[in] kappa Kappa parameter in the normalization equation
* @param[in] radius Number of elements on the right or left side to normalize across
*/
-__kernel void normalization_layer_in_map(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(squared_input),
- TENSOR3D_DECLARATION(output),
- float coeff,
- float beta,
- float kappa,
- uint radius)
+__kernel void normalization_layer_in_map_1D(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(squared_input),
+ TENSOR3D_DECLARATION(output),
+ float coeff,
+ float beta,
+ float kappa,
+ uint radius)
{
Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 89117cf..1902df9 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -50,20 +50,24 @@
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
* @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                    Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
* @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad)
* @param[in] strides The pooling operation strides in each dimension
* @param[in] paddings The pooling operation paddings in each dimension
*/
__kernel void pooling_layer_2(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output)
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output)
#ifdef POOL_AVG
,
int2 max_dims, int2 strides, int2 paddings
@@ -71,14 +75,14 @@
)
{
// Get pixels pointer
- Image input = CONVERT_TO_IMAGE_STRUCT(input);
- Image output = CONVERT_TO_IMAGE_STRUCT(output);
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
// Load data
VEC_DATA_TYPE(DATA_TYPE, 2)
- data0 = vload2(0, (__global DATA_TYPE *)offset(&input, 0, 0));
+ data0 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
VEC_DATA_TYPE(DATA_TYPE, 2)
- data1 = vload2(0, (__global DATA_TYPE *)offset(&input, 0, 1));
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
// Perform calculations
data0 = POOL_OP(data0, data1);
@@ -104,20 +108,24 @@
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
* @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
* @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                    Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
* @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad)
* @param[in] strides The pooling operation strides in each dimension
* @param[in] paddings The pooling operation paddings in each dimension
*/
__kernel void pooling_layer_3(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output)
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output)
#ifdef POOL_AVG
,
int2 max_dims, int2 strides, int2 paddings
@@ -125,16 +133,16 @@
)
{
// Get pixels pointer
- Image input = CONVERT_TO_IMAGE_STRUCT(input);
- Image output = CONVERT_TO_IMAGE_STRUCT(output);
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
// Load data
VEC_DATA_TYPE(DATA_TYPE, 3)
- data0 = vload3(0, (__global DATA_TYPE *)offset(&input, 0, 0));
+ data0 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
VEC_DATA_TYPE(DATA_TYPE, 3)
- data1 = vload3(0, (__global DATA_TYPE *)offset(&input, 0, 1));
+ data1 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
VEC_DATA_TYPE(DATA_TYPE, 3)
- data2 = vload3(0, (__global DATA_TYPE *)offset(&input, 0, 2));
+ data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
// Perform calculations
data0 = POOL_OP(data0, data1);
diff --git a/src/core/CL/cl_kernels/types.h b/src/core/CL/cl_kernels/types.h
index 9303be8..8773646 100644
--- a/src/core/CL/cl_kernels/types.h
+++ b/src/core/CL/cl_kernels/types.h
@@ -43,4 +43,14 @@
float error; /**< A tracking method specific error. Initialized to 0 by corner detectors. */
} Keypoint;
+/** Detection window struct */
+typedef struct DetectionWindow
+{
+ ushort x; /**< Top-left x coordinate */
+ ushort y; /**< Top-left y coordinate */
+ ushort width; /**< Width of the detection window */
+ ushort height; /**< Height of the detection window */
+ ushort idx_class; /**< Index of the class */
+ float score; /**< Confidence value for the detection window */
+} DetectionWindow;
#endif // ARM_COMPUTE_TYPES_H
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 8228443..26a8b85 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -32,8 +32,8 @@
*/
inline const float8 clamp_to_border(float8 coords, const float width, const float height)
{
- const float4 clamped_x = clamp(coords.even, -1.0, width);
- const float4 clamped_y = clamp(coords.odd, -1.0, height);
+ const float4 clamped_x = clamp(coords.even, -1.0f, width);
+ const float4 clamped_y = clamp(coords.odd, -1.0f, height);
return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
}
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 3e4abaa..83bbe6a 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -37,9 +37,15 @@
void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
// Set build options
std::set<std::string> build_opts;
@@ -54,5 +60,5 @@
// Make sure _kernel is initialized before calling the parent's configure
constexpr unsigned int num_elems_processed_per_iteration = 16;
- ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+ ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration);
}
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000..309a153
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0)
+{
+}
+
+void CLBatchNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
+ float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+ _input = input;
+ _output = output;
+ _mean = mean;
+ _var = var;
+ _beta = beta;
+ _gamma = gamma;
+ _epsilon = epsilon;
+
+ // Create kernel
+ std::string kernel_name = "batchnormalization_layer";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set kernel static arguments
+ unsigned int idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_float>(idx++, _epsilon);
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 4;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ Window vector_slice = window.first_slice_window_1D();
+ vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ add_1D_tensor_argument(idx, _mean, vector_slice);
+ add_1D_tensor_argument(idx, _var, vector_slice);
+ add_1D_tensor_argument(idx, _beta, vector_slice);
+ add_1D_tensor_argument(idx, _gamma, vector_slice);
+
+ do
+ {
+ idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
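The kernel arguments (mean, var, beta, gamma and the static epsilon set in configure()) correspond to the standard per-channel batch normalization expression; a scalar sketch of what is presumably applied to each element of the feature map:

    #include <cmath>

    // out = gamma * (x - mean) / sqrt(var + epsilon) + beta, per feature map (channel)
    inline float batch_normalize(float x, float mean, float var,
                                 float beta, float gamma, float epsilon)
    {
        return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
    }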
diff --git a/src/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.cpp
deleted file mode 100644
index 738ea31..0000000
--- a/src/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-using namespace arm_compute;
-
-CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
- : _input(nullptr), _biases(nullptr), _output(nullptr)
-{
-}
-
-void CLConvolutionLayerWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
- ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
-
- // Check biases
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
- }
-
- _biases = biases;
- _output = output;
- _input = input;
-
- // Create build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(((biases != nullptr) ? "-DHAS_BIAS" : ""));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
-
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps());
- // The CLConvolutionLayerWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- ICLKernel::configure(win);
-}
-
-void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info());
-
- Window out_slice = out_window.first_slice_window_2D();
- Window in_slice = window.first_slice_window_3D();
-
- Window biases_slice;
- if(_biases != nullptr)
- {
- biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
- }
- unsigned int increment_biases = 0;
- unsigned int increment_output = 0;
-
- // Run kernel
- do
- {
- // Set arguments
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
-
- if(_biases != nullptr)
- {
- add_2D_tensor_argument(idx, _biases, biases_slice);
- biases_slice.set(Window::DimX, Window::Dimension(++increment_biases, _biases->info()->dimension(0), 1));
- }
-
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
- enqueue(queue, *this, in_slice);
- out_slice.set(Window::DimX, Window::Dimension(++increment_output, _output->info()->dimension(0), 1));
-
- }
- while(window.slide_window_slice_3D(in_slice));
-}
diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
new file mode 100644
index 0000000..73f1ba1
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLDepthConcatenateKernel::CLDepthConcatenateKernel()
+ : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+{
+}
+
+BorderSize CLDepthConcatenateKernel::border_size() const
+{
+ return BorderSize(_top_bottom, _left_right);
+}
+
+void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+ // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+ // Otherwise it is not clear how the padding should be added onto the input tensor
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth"));
+
+ // Configure kernel window
+ _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+ _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+ const unsigned int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2] + _left_right * output->info()->strides_in_bytes()[0] + _top_bottom *
+ output->info()->strides_in_bytes()[1];
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_read_per_iteration = 4;
+ const unsigned int num_rows_read_per_iteration = 1;
+
+ // The window needs to be based on input as we copy all the depths of input
+ Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<unsigned int>(idx, offset_to_first_elements_in_bytes);
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
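The offset_to_first_elements_in_bytes passed as the static kernel argument places the copied input at depth_offset along Z and centres it in X/Y. As a standalone calculation mirroring the expression in configure() (hypothetical helper):

    #include <cstddef>

    // Byte offset of the first output element written by concatenate_depth
    std::size_t depth_concat_offset(std::size_t depth_offset, std::size_t left_right, std::size_t top_bottom,
                                    std::size_t stride_x_bytes, std::size_t stride_y_bytes, std::size_t stride_z_bytes)
    {
        return depth_offset * stride_z_bytes + left_right * stride_x_bytes + top_bottom * stride_y_bytes;
    }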
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 5bbc3ef..981aad6 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -63,6 +63,8 @@
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+ border_size.limit(tensor->info()->padding());
+
// If there is no border: early exit
if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
{
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 522674d..d7388e8 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -24,6 +24,8 @@
#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -106,7 +108,16 @@
// Create kernel
std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_mm_" + data_type_name), build_opts));
+
+ if(data_type_name == "f32")
+ {
+ GPUTarget arch_target = get_arch_from_target(get_target());
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts));
+ }
+ else
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts));
+ }
// Configure window kernel
const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
@@ -115,7 +126,7 @@
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
- AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
update_window_and_padding(win, input0_access, input1_access, output_access);
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 30195d9..ecee1ab 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -41,11 +41,18 @@
void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t transpose_w = 16 / input->info()->element_size();
+ output_shape.set(0, input->info()->dimension(1) * transpose_w);
+ output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+
+    // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(0)) / 16.0f)) && (input->info()->data_type() == DataType::U8));
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(0)) / 8.0f)) && (input->info()->data_type() == DataType::F16));
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(0)) / 4.0f)) && (input->info()->data_type() == DataType::F32));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
_input = input;
_output = output;
@@ -97,3 +104,26 @@
ICLKernel::configure(win);
}
+
+void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Output is transposed
+ Window out_window(window);
+ out_window.set(Window::DimX, window.y());
+ out_window.set(Window::DimY, window.x());
+
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
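The auto-initialized output shape in configure() packs transpose_w = 16 / element_size consecutive elements of each input row together. As a standalone helper with a worked example: a 31x7 F32 matrix (element size 4, so transpose_w = 4) becomes 28x8.

    #include <cmath>
    #include <cstddef>
    #include <utility>

    // (width, height) of the transposed-1xW output for a given input
    std::pair<std::size_t, std::size_t> transpose1xw_shape(std::size_t width, std::size_t height, std::size_t element_size)
    {
        const std::size_t transpose_w = 16 / element_size; // elements packed together
        const std::size_t out_width   = height * transpose_w;
        const std::size_t out_height  = static_cast<std::size_t>(std::ceil(width / static_cast<float>(transpose_w)));
        return std::make_pair(out_width, out_height);
    }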
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
new file mode 100644
index 0000000..87659c4
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel()
+ : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size()
+{
+}
+
+void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
+ ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
+
+ _input_magnitude = input_magnitude;
+ _input_phase = input_phase;
+ _output = output;
+ _cell_size = hog_info->cell_size();
+
+ float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f);
+ phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
+
+ std::stringstream args_str;
+ args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " ";
+ args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " ";
+ args_str << "-DNUM_BINS=" << hog_info->num_bins() << " ";
+ args_str << "-DPHASE_SCALE=" << phase_scale << " ";
+
+ // Construct kernel name
+ std::set<std::string> build_opts = {};
+ build_opts.insert(args_str.str());
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_orientation_binning", build_opts));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = hog_info->cell_size().height;
+ constexpr unsigned int num_elems_written_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ // Compute slice for the magnitude and phase tensors
+ Window slice_mag_phase = window.first_slice_window_2D();
+ slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width));
+ slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height));
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase);
+ add_2D_tensor_argument(idx, _input_phase, slice_mag_phase);
+ add_2D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
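The PHASE_SCALE build option folds two conversions into one factor: the U8 phase is mapped back to degrees (signed phase packs 0-360 degrees into 0-255, unsigned phase already stores 0-180 degrees) and then to a histogram bin index. A scalar sketch of the resulting mapping, assuming the kernel multiplies the phase value by PHASE_SCALE (hypothetical helper):

    // Bin position derived from a U8 phase value and the number of histogram bins
    float phase_to_bin(unsigned char phase_u8, unsigned int num_bins, bool signed_phase)
    {
        const float phase_scale = signed_phase ? (num_bins / 255.0f) : (num_bins / 180.0f);
        return phase_u8 * phase_scale;
    }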
+CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel()
+ : _input(nullptr), _output(nullptr), _num_cells_per_block_stride()
+{
+}
+
+void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
+{
+ ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+
+ // Number of cells per block
+ const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
+ hog_info->block_size().height / hog_info->cell_size().height);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32);
+
+ // Number of cells per block stride
+ const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
+ hog_info->block_stride().height / hog_info->cell_size().height);
+
+ _input = input;
+ _output = output;
+ _num_cells_per_block_stride = num_cells_per_block_stride;
+
+ std::stringstream args_str;
+ args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " ";
+ args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " ";
+    args_str << "-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width * hog_info->num_bins() << " ";
+ args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " ";
+ args_str << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " ";
+ args_str << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " ";
+ args_str << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " ";
+ args_str << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " ";
+
+ // Construct kernel name
+ std::set<std::string> build_opts = {};
+ build_opts.insert(args_str.str());
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_block_normalization", build_opts));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = num_cells_per_block.height;
+ constexpr unsigned int num_elems_written_per_iteration = 1;
+ const unsigned int num_rows_written_per_iteration = num_cells_per_block.height;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+        // Compute slice for the input tensor
+ Window slice_in = window.first_slice_window_2D();
+ slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
+ slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
new file mode 100644
index 0000000..0f9a989
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLHOGDetectorKernel::CLHOGDetectorKernel()
+ : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr)
+{
+}
+
+void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride,
+ float threshold, uint16_t idx_class)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(hog == nullptr);
+ ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
+ ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
+
+ const Size2D &detection_window_size = hog->info()->detection_window_size();
+ const Size2D &block_size = hog->info()->block_size();
+ const Size2D &block_stride = hog->info()->block_stride();
+
+ _input = input;
+ _detection_windows = detection_windows;
+ _num_detection_windows = num_detection_windows;
+
+ const unsigned int num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
+ const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
+
+ ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
+
+ std::stringstream args_str;
+ args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " ";
+ args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " ";
+ args_str << "-DTHRESHOLD=" << threshold << " ";
+ args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
+ args_str << "-DIDX_CLASS=" << idx_class << " ";
+ args_str << "-DBLOCK_STRIDE_WIDTH=" << block_stride.width << " ";
+ args_str << "-DBLOCK_STRIDE_HEIGHT=" << block_stride.height << " ";
+ args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
+ args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
+
+ // Construct kernel name
+ std::set<std::string> build_opts = {};
+ build_opts.insert(args_str.str());
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_detector", build_opts));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
+ _kernel.setArg(idx++, hog->cl_buffer());
+ _kernel.setArg(idx++, detection_windows->cl_buffer());
+ _kernel.setArg(idx++, *_num_detection_windows);
+
+ // Get the number of blocks along the x and y directions of the input tensor
+ const ValidRegion &valid_region = input->info()->valid_region();
+ const size_t num_blocks_x = valid_region.shape[0];
+ const size_t num_blocks_y = valid_region.shape[1];
+
+ // Get the number of blocks along the x and y directions of the detection window
+ const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
+ const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
+
+ const size_t window_step_x = detection_window_stride.width / block_stride.width;
+ const size_t window_step_y = detection_window_stride.height / block_stride.height;
+
+ // Configure kernel window
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
+ win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
+
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = num_blocks_per_descriptor_y;
+
+ update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
+
+ ICLKernel::configure(win);
+}
+
+void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
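The consistency check in configure() ties the SVM descriptor length to the window geometry. As a standalone calculation, and with the classic 64x128 people-detection geometry taken as an assumed example (16x16 blocks, 8x8 block stride, 9 bins, hence 36 channels per block), it gives 252 * 15 + 1 = 3781, i.e. the familiar 3780-element descriptor plus the bias:

    // descriptor_size expected by CLHOGDetectorKernel::configure()
    unsigned int hog_descriptor_size(unsigned int win_w, unsigned int win_h,
                                     unsigned int block_w, unsigned int block_h,
                                     unsigned int block_stride_w, unsigned int block_stride_h,
                                     unsigned int bins_per_block /* = input channels */)
    {
        const unsigned int bins_per_descriptor_x   = ((win_w - block_w) / block_stride_w + 1) * bins_per_block;
        const unsigned int blocks_per_descriptor_y = (win_h - block_h) / block_stride_h + 1;
        return bins_per_descriptor_x * blocks_per_descriptor_y + 1; // +1 for the SVM bias
    }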
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
index a04aea8..69ede45 100644
--- a/src/core/CL/kernels/CLIntegralImageKernel.cpp
+++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp
@@ -41,11 +41,26 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+ _input = input;
+ _output = output;
+
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_horizontal"));
- // Configure kernel
- ICLSimple2DKernel::configure(input, output, input->info()->dimension(0));
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
+ const unsigned int num_elems_accessed_per_iteration = ceil_to_multiple(num_elems_processed_per_iteration, 16);
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
}
CLIntegralImageVertKernel::CLIntegralImageVertKernel()
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000..794a1bc
--- /dev/null
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ if(output->info()->dimension(1) == 196)
+ {
+ _lws_hint = cl::NDRange(1, 7);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(8, 8);
+ }
+
+ std::ostringstream mm_arguments;
+ std::set<std::string> build_opts;
+
+ mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
+ build_opts.emplace(mm_arguments.str());
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_lc_vm_" + data_type_name), build_opts));
+
+ // Configure window kernel
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ Window matrix_b_window;
+ matrix_b_window.use_tensor_dimensions(_input1->info());
+ Window slice_matrix_b = matrix_b_window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_3D_tensor_argument(idx, _input1, slice_matrix_b);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
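
The host-side run() above slides a 2D slice over the output and input0 while input1 keeps a full 3D view, so each output row can draw on its own weight plane. A minimal scalar sketch of that per-row vector-matrix product follows; the function name, layouts and loop order are illustrative only, not the OpenCL kernel itself.

    #include <cstddef>
    #include <vector>

    // Illustrative reference: out[y][x] = sum_k in0[y][k] * w[y][k][x],
    // where w holds one (K x N) weight plane per output row y.
    void locally_connected_vm(const std::vector<float> &in0, // M x K, row-major
                              const std::vector<float> &w,   // M x K x N, row-major
                              std::vector<float>       &out, // M x N, row-major
                              size_t M, size_t K, size_t N)
    {
        for(size_t y = 0; y < M; ++y)
        {
            for(size_t x = 0; x < N; ++x)
            {
                float acc = 0.f;
                for(size_t k = 0; k < K; ++k)
                {
                    acc += in0[y * K + k] * w[(y * K + k) * N + x];
                }
                out[y * N + x] = acc;
            }
        }
    }
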
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 3668513..106a511 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -50,6 +50,7 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+ ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
// Set build options
std::set<std::string> build_opts;
@@ -59,11 +60,12 @@
_squared_input = squared_input;
_output = output;
- const bool is_in_map = (norm_info.type() == NormType::IN_MAP);
- _border_size = (is_in_map) ? BorderSize(0, 3) : BorderSize(0);
+ const bool is_in_map = (norm_info.type() == NormType::IN_MAP_1D);
+ const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+ _border_size = BorderSize(0, border_width);
// Create kernel
- std::string kernel_name = (norm_info.type() == NormType::IN_MAP) ? "normalization_layer_in_map" : "normalization_layer_cross_map";
+ std::string kernel_name = (norm_info.type() == NormType::IN_MAP_1D) ? "normalization_layer_in_map_1D" : "normalization_layer_cross_map";
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
// Set kernel static arguments
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 8fe42d2..dc5ae4e 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -131,7 +131,7 @@
};
// Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_2D_tensor();
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
_kernel.setArg<cl_int2>(idx++, max_dims);
_kernel.setArg<cl_int2>(idx++, strides);
_kernel.setArg<cl_int2>(idx++, paddings);
@@ -161,7 +161,7 @@
std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- Window slice = window.first_slice_window_2D();
+ Window slice = window.first_slice_window_3D();
do
{
@@ -172,9 +172,9 @@
// Set inputs
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(window.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 7c709bb..0470d52 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -45,21 +45,27 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- _input = input;
- _output = output;
- const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
+ _input = input;
+ _output = output;
+
+ // The kernel loops over all elements in steps of 16
+ const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
// Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DUSE_" + string_from_data_type(input->info()->data_type())));
- build_opts.emplace(((num_elems_processed_per_iteration % max_cl_vector_width) != 0) ? "-DNON_MULTIPLE_OF_16" : "");
+ std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+ // Tell the kernel that the width is not a multiple of 16
+ if((input->info()->dimension(0) % max_cl_vector_width) != 0)
+ {
+ build_opts.emplace("-DNON_MULTIPLE_OF_16");
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
// Set fixed arguments
unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_uint>(idx++, num_elems_processed_per_iteration);
+ _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
// Configure kernel window
constexpr unsigned int num_elems_written_per_iteration = 1;
@@ -88,23 +94,29 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
- _input = input;
- _max = max;
- _output = output;
- _sum = sum;
- const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
+ _input = input;
+ _max = max;
+ _output = output;
+ _sum = sum;
+
+ // The kernel loops over all elements in steps of 16
+ const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
// Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DUSE_" + string_from_data_type(input->info()->data_type())));
- build_opts.emplace(((num_elems_processed_per_iteration % max_cl_vector_width) != 0) ? "-DNON_MULTIPLE_OF_16" : "");
+ std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+ // Tell the kernel that the width is not a multiple of 16
+ if((input->info()->dimension(0) % max_cl_vector_width) != 0)
+ {
+ build_opts.emplace("-DNON_MULTIPLE_OF_16");
+ }
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
// Set fixed arguments
unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_uint>(idx++, num_elems_processed_per_iteration);
+ _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
// Configure window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -190,10 +202,13 @@
do
{
+ Window sum_slice = slice;
+ sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+
unsigned int idx = 0;
// Set inputs
add_2D_tensor_argument(idx, _input, slice);
- add_2D_tensor_argument(idx, _sum, slice);
+ add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
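
Both configure() overloads now round the window step up to a multiple of 16 while the true row width is still passed as a static kernel argument; -DNON_MULTIPLE_OF_16 is defined only when the width has a tail the OpenCL code must guard. A sketch of the rounding helper, assuming the usual definition (the library ships its own in Utils.h):

    #include <cstddef>

    // Round value up to the nearest multiple of 'multiple' (illustrative helper
    // matching the behaviour the configure() code above relies on).
    constexpr size_t ceil_to_multiple_example(size_t value, size_t multiple)
    {
        return ((value + multiple - 1) / multiple) * multiple;
    }

    static_assert(ceil_to_multiple_example(33, 16) == 48, "33 elements take three steps of 16");
    static_assert(ceil_to_multiple_example(32, 16) == 32, "exact multiples are unchanged");
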
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 49931f4..2ee6fcb 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -40,11 +40,20 @@
void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t w_out = input->info()->dimension(1);
+ const size_t h_out = input->info()->dimension(0);
+ output_shape.set(0, w_out);
+ output_shape.set(1, h_out);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(1));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
_input = input;
_output = output;
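
configure() now derives the output shape itself by swapping the first two dimensions of the input and auto-initialises the output tensor when its info is still empty, replacing the two explicit dimension checks. A sketch of that derivation using the library's TensorShape, mirroring the lines above:

    #include "arm_compute/core/TensorShape.h"

    // (W, H, ...) -> (H, W, ...): the shape the transposed output is expected to have.
    arm_compute::TensorShape transposed_shape(const arm_compute::TensorShape &in)
    {
        arm_compute::TensorShape out{ in };
        out.set(0, in[1]); // new width  = old height
        out.set(1, in[0]); // new height = old width
        return out;
    }
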
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
new file mode 100644
index 0000000..018f272
--- /dev/null
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
+ : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr)
+{
+}
+
+void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ if(_is_shared)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2)));
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5);
+ ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
+ }
+
+ // Check biases
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ }
+
+ _biases = biases;
+ _output = output;
+ _input = input;
+
+ // Create build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(((biases != nullptr) ? "-DHAS_BIAS" : ""));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
+
+ // Set static arguments
+ unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+ idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure(win);
+}
+
+CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
+ : CLWeightsReshapeKernel(false)
+{
+}
+
+void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info());
+
+ Window in_slice = window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ // Set arguments
+ unsigned idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ if(_biases != nullptr)
+ {
+ Window biases_slice;
+ biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+ }
+
+ // Run kernel
+ enqueue(queue, *this, in_slice);
+}
+
+CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
+ : CLWeightsReshapeKernel(true)
+{
+}
+
+void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info());
+
+ Window in_slice = window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ Window biases_window;
+ Window biases_slice;
+
+ if(_biases != nullptr)
+ {
+ biases_window.use_tensor_dimensions(_biases->info());
+ biases_slice = biases_window.first_slice_window_1D();
+ }
+
+ do
+ {
+ // Set arguments
+ unsigned idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ if(_biases != nullptr)
+ {
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+ biases_window.slide_window_slice_1D(biases_slice);
+ }
+
+ // Run kernel
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
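
reshape_to_columns linearises each kernel into one column of a 2D matrix, with the bias appended as an extra row when present; the shared (locally connected) variant repeats that layout once per location along a third dimension. A scalar sketch of the presumed index mapping for the non-shared case; the layouts and the function name are assumptions for illustration, not the OpenCL kernel.

    #include <cstddef>
    #include <vector>

    // Flatten [W x H x C x N] weights into a (W*H*C [+1]) x N matrix,
    // one column per output feature map, bias (if any) in the last row.
    void reshape_weights_to_columns(const std::vector<float> &w, const float *bias,
                                    std::vector<float> &out,
                                    size_t W, size_t H, size_t C, size_t N)
    {
        const size_t rows = W * H * C + (bias != nullptr ? 1 : 0);
        out.assign(rows * N, 0.f);

        for(size_t n = 0; n < N; ++n)
        {
            for(size_t c = 0; c < C; ++c)
            {
                for(size_t y = 0; y < H; ++y)
                {
                    for(size_t x = 0; x < W; ++x)
                    {
                        const size_t row = x + y * W + c * W * H;
                        out[row * N + n] = w[((n * C + c) * H + y) * W + x];
                    }
                }
            }
        }

        if(bias != nullptr)
        {
            for(size_t n = 0; n < N; ++n)
            {
                out[(rows - 1) * N + n] = bias[n];
            }
        }
    }
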
diff --git a/src/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
similarity index 84%
rename from src/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.cpp
rename to src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
index 4ab3ddf..62bfdd6 100644
--- a/src/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -39,17 +39,17 @@
}
} // namespace
-NEHOGNonMaximaSuppressionKernel::NEHOGNonMaximaSuppressionKernel()
+CPPDetectionWindowNonMaximaSuppressionKernel::CPPDetectionWindowNonMaximaSuppressionKernel()
: _input_output(nullptr), _min_distance(0.0f)
{
}
-bool NEHOGNonMaximaSuppressionKernel::is_parallelisable() const
+bool CPPDetectionWindowNonMaximaSuppressionKernel::is_parallelisable() const
{
return false;
}
-void NEHOGNonMaximaSuppressionKernel::configure(IDetectionWindowArray *input_output, float min_distance)
+void CPPDetectionWindowNonMaximaSuppressionKernel::configure(IDetectionWindowArray *input_output, float min_distance)
{
ARM_COMPUTE_ERROR_ON(nullptr == input_output);
@@ -59,7 +59,7 @@
IKernel::configure(Window()); // Default 1 iteration window
}
-void NEHOGNonMaximaSuppressionKernel::run(const Window &window)
+void CPPDetectionWindowNonMaximaSuppressionKernel::run(const Window &window)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
@@ -68,12 +68,12 @@
const size_t num_candidates = _input_output->num_values();
size_t num_detections = 0;
- /* Sort list of candidates */
+ // Sort list of candidates
std::sort(_input_output->buffer(), _input_output->buffer() + num_candidates, compare_detection_window);
const float min_distance_pow2 = _min_distance * _min_distance;
- /* Euclidean distance */
+ // Euclidean distance
for(size_t i = 0; i < num_candidates; ++i)
{
if(0.0f != _input_output->at(i).score)
@@ -86,7 +86,7 @@
cur.idx_class = _input_output->at(i).idx_class;
cur.score = _input_output->at(i).score;
- /* Store window */
+ // Store window
_input_output->at(num_detections) = cur;
++num_detections;
@@ -108,7 +108,7 @@
if(d < min_distance_pow2)
{
- /* Invalidate keypoint */
+ // Invalidate keypoint
_input_output->at(k).score = 0.0f;
}
}
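
The renamed kernel keeps the same greedy scheme: sort the candidates by score, keep the strongest, and zero the score of any remaining window whose centre lies closer than min_distance to a kept one (compared as squared distances, as above). A compact sketch of that logic on a plain struct; the field names are illustrative, not the library's DetectionWindow.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct Candidate // illustrative stand-in for a detection window
    {
        float x, y;   // top-left corner
        float w, h;   // width and height
        float score;  // 0 marks a suppressed window
    };

    void suppress_by_distance(std::vector<Candidate> &windows, float min_distance)
    {
        // Strongest candidates first
        std::sort(windows.begin(), windows.end(),
                  [](const Candidate &a, const Candidate &b) { return a.score > b.score; });

        const float min_dist2 = min_distance * min_distance;

        for(size_t i = 0; i < windows.size(); ++i)
        {
            if(windows[i].score == 0.f)
            {
                continue; // already suppressed
            }
            const float cx = windows[i].x + 0.5f * windows[i].w;
            const float cy = windows[i].y + 0.5f * windows[i].h;

            for(size_t k = i + 1; k < windows.size(); ++k)
            {
                const float dx = cx - (windows[k].x + 0.5f * windows[k].w);
                const float dy = cy - (windows[k].y + 0.5f * windows[k].h);
                if(dx * dx + dy * dy < min_dist2)
                {
                    windows[k].score = 0.f; // too close to a kept window
                }
            }
        }
    }
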
diff --git a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
index d3e39dc..09d3ccf 100644
--- a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
+++ b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
@@ -60,7 +60,7 @@
_output = output;
_min_distance = min_distance * min_distance; // We compare squares of distances
_num_corner_candidates = num_corner_candidates;
- INEKernel::configure(Window()); // Default 1 iteration window
+ ICPPKernel::configure(Window()); // Default 1 iteration window
}
bool CPPSortEuclideanDistanceKernel::is_parallelisable() const
@@ -71,7 +71,7 @@
void CPPSortEuclideanDistanceKernel::run(const Window &window)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICPPKernel::window(), window);
const int32_t num_corner_candidates = *_num_corner_candidates;
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index ad4f343..389e390 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -25,6 +25,7 @@
#include <cstdarg>
#include <cstdio>
+#include <iostream>
#include <stdexcept>
void arm_compute::error(const char *function, const char *file, const int line, const char *msg, ...)
@@ -38,3 +39,14 @@
throw std::runtime_error(std::string(out));
}
+
+void arm_compute::debug(const char *function, const char *file, const int line, const char *msg, ...)
+{
+ char out[512];
+ va_list args;
+ va_start(args, msg);
+ int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
+ vsnprintf(out + offset, sizeof(out) - offset, msg, args);
+ va_end(args);
+ std::cout << std::string(out) << std::endl;
+}
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index 9d93459..ff903e9 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -25,7 +25,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/IKernel.h"
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Utils.h"
#include <algorithm>
@@ -33,7 +33,7 @@
using namespace arm_compute;
-Window arm_compute::calculate_max_window(const TensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
+Window arm_compute::calculate_max_window(const ITensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
{
if(!skip_border)
{
@@ -76,7 +76,45 @@
return window;
}
-Window arm_compute::calculate_max_window_horizontal(const TensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
+Window arm_compute::calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps, BorderSize border_size)
+{
+ const Coordinates &anchor = info.valid_region().anchor;
+ const TensorShape &shape = info.valid_region().shape;
+
+ Window window;
+
+ window.set(0, Window::Dimension(
+ // move the anchor to the start from the border
+ anchor[0] - border_size.left,
+ // move the anchor to include the right end border
+ // Make sure the window width is a multiple of the step size
+ anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
+ steps[0]));
+
+ size_t n = 1;
+ const TensorShape &tensor_shape = info.tensor_shape();
+
+ if(tensor_shape.num_dimensions() > 1)
+ {
+ window.set(1, Window::Dimension(
+ // Include the border above the image
+ anchor[1] - border_size.top,
+ // Include the border below the image
+ anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
+ steps[1]));
+
+ ++n;
+ }
+
+ for(; n < Coordinates::num_max_dimensions; ++n)
+ {
+ window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+ }
+
+ return window;
+}
+
+Window arm_compute::calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
{
if(skip_border)
{
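
calculate_max_enlarged_window starts each of the first two dimensions inside the left/top border and extends them over the valid region plus both borders, rounded up to the step size. Worked out for one dimension with illustrative numbers:

    // valid-region anchor = 0, valid shape = 20, border left = right = 2, step = 8
    constexpr int start = 0 - 2;   // anchor - border.left
    constexpr int end   = -2 + 24; // start + ceil_to_multiple(20 + 2 + 2, 8)
    static_assert(end - start == 24, "window spans border to border and is a multiple of the step");
    // The window therefore covers [-2, 22) and executes at x = -2, 6 and 14.
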
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index cbc59e6..4ddc0fe 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -56,7 +56,10 @@
// Additionally the valid region is shifted by the offset that is used by
// the kernel to write back output values.
anchor.set(0, std::max<int>(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x);
- anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y);
+ if(_info->num_dimensions() > 1)
+ {
+ anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y);
+ }
// End of the valid region is equal to the start of the last write of the
// kernel plus the number of written elements. (This assumes that all
@@ -67,7 +70,10 @@
// execution window. Afterwards the new end points are converted back into
// a size of the region.
shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]);
- shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]);
+ if(_info->num_dimensions() > 1)
+ {
+ shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]);
+ }
// For higher dimensions use the intersection of the window size and the
// valid region of the input
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 04b827e..0b29eca 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -39,8 +39,8 @@
return;
}
- const TensorInfo *src_info = src.info();
- TensorInfo *dst_info = this->info();
+ const ITensorInfo *src_info = src.info();
+ ITensorInfo *dst_info = this->info();
ARM_COMPUTE_ERROR_ON(src_info->num_dimensions() > dst_info->num_dimensions());
ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels());
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
index b3d9c6d..edb0a0f 100644
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -132,6 +132,20 @@
void NEAbsoluteDifferenceKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::U8);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
index 88e9b86..e5b933a 100644
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -248,8 +248,15 @@
void NEAccumulateKernel::configure(const ITensor *input, ITensor *accum)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+ set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*accum->info(), Format::S16);
+
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
constexpr unsigned int num_elems_processed_per_iteration = 16;
INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
@@ -276,6 +283,13 @@
void NEAccumulateWeightedKernel::configure(const ITensor *input, float alpha, ITensor *accum)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+ set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*accum->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
@@ -311,6 +325,13 @@
void NEAccumulateSquaredKernel::configure(const ITensor *input, uint32_t shift, ITensor *accum)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+ set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*accum->info(), Format::S16);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
ARM_COMPUTE_ERROR_ON(shift > 15);
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index e7166b0..a878078 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -23,8 +23,10 @@
*/
#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -33,6 +35,7 @@
#include <arm_neon.h>
#include <array>
+#include <cmath>
#include <map>
using namespace arm_compute;
@@ -44,39 +47,71 @@
void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map =
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ // Activation functions : FP32
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
{
- { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS> },
- { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR> },
- { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC> },
- { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU> },
- { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU> },
- { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU> },
- { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT> },
- { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE> },
- { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH> },
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, float> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, float> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, float> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, float> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, float> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, float> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, float> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, float> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float> },
};
+
+ // Activation functions : QS8
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs8 =
+ {
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint8_t> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint8_t> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint8_t> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint8_t> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint8_t> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint8_t> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint8_t> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint8_t> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint8_t> },
+ };
+
_input = input;
_output = output;
- _func = act_map[activation_info.activation()];
_act_info = activation_info;
+ switch(input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = act_map_f32[activation_info.activation()];
+ break;
+ case DataType::QS8:
+ _func = act_map_qs8[activation_info.activation()];
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
constexpr unsigned int num_elems_processed_per_iteration = 16;
INESimpleKernel::configure(_input, _output, num_elems_processed_per_iteration);
}
-template <ActivationLayerInfo::ActivationFunction F>
-void NEActivationLayerKernel::activation(const Window &window)
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
{
Iterator input(_input, window);
Iterator output(_output, window);
- static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
+ static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
const float32x4_t a = vdupq_n_f32(_act_info.a());
const float32x4_t b = vdupq_n_f32(_act_info.b());
@@ -199,6 +234,64 @@
input, output);
}
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ int fixed_point_position = _input->info()->fixed_point_position();
+
+ static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
+ const qint8x16_t CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position));
+ const qint8x16_t a = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position));
+ const qint8x16_t b = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ const qint8x16_t in = vld1q_qs8(input_ptr);
+ qint8x16_t tmp = {};
+
+ switch(F)
+ {
+ case ActivationFunction::ABS:
+ tmp = vqabsq_qs8(in);
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in));
+ break;
+ case ActivationFunction::LINEAR:
+ tmp = vqmlaq_qs8(b, a, in, fixed_point_position);
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp = vrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
+ break;
+ case ActivationFunction::RELU:
+ tmp = vmaxq_qs8(CONST_0, in);
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position);
+ break;
+ case ActivationFunction::SQRT:
+ tmp = vrecipq_qs8(vinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
+ break;
+ case ActivationFunction::SQUARE:
+ tmp = vqmulq_qs8(in, in, fixed_point_position);
+ break;
+ case ActivationFunction::TANH:
+ tmp = vtanhq_qs8(in, fixed_point_position);
+ break;
+ default:
+ break;
+ }
+
+ vst1q_qs8(output_ptr, tmp);
+ },
+ input, output);
+}
+
void NEActivationLayerKernel::run(const Window &window)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
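
The new QS8 path works on 8-bit fixed-point values: scvt_qs8_f32 scales a float by 2^fixed_point_position and saturates it into an int8_t, and the vq* intrinsics keep that scale consistent through every operation. A hedged sketch of that conversion; the library's own helpers live in FixedPoint.h and NEFixedPoint.h, and the name below is illustrative.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustrative float -> QS8 conversion: scale by 2^fixed_point_position,
    // round and saturate to the int8_t range.
    inline int8_t float_to_qs8_example(float value, int fixed_point_position)
    {
        const float scaled  = value * static_cast<float>(1 << fixed_point_position);
        const float clamped = std::min(127.f, std::max(-128.f, std::round(scaled)));
        return static_cast<int8_t>(clamped);
    }

    // With fixed_point_position = 4, 1.0f maps to 16 and 0.5f maps to 8.
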
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 55c4b76..a4fdad8 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include <algorithm>
#include <arm_neon.h>
#include <cstdint>
#include <map>
@@ -291,6 +292,20 @@
void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index f6ee1d1..d3e62b0 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include <algorithm>
#include <arm_neon.h>
#include <cstdint>
#include <map>
@@ -284,11 +285,23 @@
void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
-
- /* If one of the inputs is 16bit then the output must be 16bit too: */
ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000..9a216ae
--- /dev/null
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon()
+{
+}
+
+void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+{
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ // Hold information about the current feature map we are iterating over.
+ // Only compute denominator and NEON vectors once per feature map.
+ int slice = -1;
+
+ int fixed_point_position = in->info()->fixed_point_position();
+ const auto input_mean = reinterpret_cast<const qint8_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const qint8_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = reinterpret_cast<const qint8_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
+ const auto input_beta = reinterpret_cast<const qint8_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+ qint8x16_t mean_vec = vdupq_n_qs8(0);
+ qint8x16_t var_vec = vdupq_n_qs8(0);
+ qint8x16_t gamma_vec = vdupq_n_qs8(0);
+ qint8x16_t beta_vec = vdupq_n_qs8(0);
+ qint8x16_t denominator = vdupq_n_qs8(0);
+ const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position));
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(slice != id.z())
+ {
+ // Construct vectors
+ mean_vec = vdupq_n_qs8(*(input_mean + id.z()));
+ var_vec = vdupq_n_qs8(*(input_var + id.z()));
+ gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
+ beta_vec = vdupq_n_qs8(*(input_beta + id.z()));
+
+ // Calculate denominator
+ denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position);
+ slice = id.z();
+ }
+
+ // Calculate x bar and store results
+ const qint8x16_t numerator = vqsubq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), mean_vec);
+ const qint8x16_t x_bar = vqmulq_qs8(numerator, denominator, fixed_point_position);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmlaq_qs8(beta_vec, x_bar, gamma_vec, fixed_point_position));
+ },
+ input, output);
+}
+
+void batch_normalization_fp32(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+{
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+ // Hold information about the current feature map we are iterating over.
+ // Only compute denominator and NEON vectors once per feature map.
+ int slice = -1;
+
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0)));
+ const auto input_beta = reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+ float32x4_t mean_vec = vdupq_n_f32(0.0);
+ float32x4_t var_vec = vdupq_n_f32(0.0);
+ float32x4_t gamma_vec = vdupq_n_f32(0.0);
+ float32x4_t beta_vec = vdupq_n_f32(0.0);
+ float32x4_t denominator = vdupq_n_f32(0.0);
+ const float32x4_t epsilon_vec = vdupq_n_f32(epsilon);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(slice != id.z())
+ {
+ // Construct vectors
+ mean_vec = vdupq_n_f32(*(input_mean + id.z()));
+ var_vec = vdupq_n_f32(*(input_var + id.z()));
+ gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
+ beta_vec = vdupq_n_f32(*(input_beta + id.z()));
+
+ // Calculate denominator
+ denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
+ slice = id.z();
+ }
+
+ // Calculate x bar and store results
+ const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
+ const float32x4_t x_bar = vmulq_f32(numerator, denominator);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmlaq_f32(beta_vec, x_bar, gamma_vec));
+ },
+ input, output);
+}
+
+void NEBatchNormalizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ _input = input;
+ _output = output;
+ _mean = mean;
+ _var = var;
+ _gamma = gamma;
+ _beta = beta;
+ _epsilon = epsilon;
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &batch_normalization_q8;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case DataType::F32:
+ _func = &batch_normalization_fp32;
+ num_elems_processed_per_iteration = 4;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEBatchNormalizationLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _output, _mean, _var, _beta, _gamma, _epsilon, window);
+}
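
Both code paths implement the same per-channel transform, vectorised along x: x_hat = (x - mean[z]) / sqrt(var[z] + epsilon) and out = gamma[z] * x_hat + beta[z], with the denominator recomputed only when the loop enters a new feature map. A scalar reference of that formula, with no NEON and an assumed planar layout:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Scalar reference of the batch normalization computed above.
    // Tensors are assumed planar: element (x, y, z) lives at z * width * height + y * width + x.
    void batch_normalization_reference(const std::vector<float> &in, std::vector<float> &out,
                                       const std::vector<float> &mean, const std::vector<float> &var,
                                       const std::vector<float> &beta, const std::vector<float> &gamma,
                                       float epsilon, size_t width, size_t height, size_t channels)
    {
        for(size_t z = 0; z < channels; ++z)
        {
            const float denominator = 1.f / std::sqrt(var[z] + epsilon);
            for(size_t i = 0; i < width * height; ++i)
            {
                const size_t idx   = z * width * height + i;
                const float  x_hat = (in[idx] - mean[z]) * denominator;
                out[idx] = gamma[z] * x_hat + beta[z];
            }
        }
    }
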
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index dfde4b4..e8e448e 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -56,9 +56,19 @@
void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input1->info(), Format::U8);
+ set_format_if_unknown(*input2->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
_input1 = input1;
_input2 = input2;
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 246644c..bf75592 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -55,8 +55,17 @@
void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
_input = input;
_output = output;
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index d974202..f184be2 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -56,9 +56,19 @@
void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input1->info(), Format::U8);
+ set_format_if_unknown(*input2->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
_input1 = input1;
_input2 = input2;
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index 20873b2..c4fb4c0 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -56,9 +56,19 @@
void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input1->info(), Format::U8);
+ set_format_if_unknown(*input2->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
_input1 = input1;
_input2 = input2;
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index 24b7696..d7e6d73 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -112,8 +112,17 @@
void NEBox3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*input->info(), Format::U8);
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
_input = input;
_output = output;
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index b722b13..85a2cd5 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -725,6 +725,16 @@
void NEGradientFP16Kernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
+
+ set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
+ set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
+
+ Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
+ set_format_if_unknown(*magnitude->info(), magnitude_format);
+ set_format_if_unknown(*phase->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
@@ -1604,11 +1614,21 @@
void NEGradientKernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
+
+ set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
+ set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
+
+ Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
+ set_format_if_unknown(*magnitude->info(), magnitude_format);
+ set_format_if_unknown(*phase->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(gy->info()->data_type()), "Gx and Gy must have the same element size");
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
_gx = gx;
@@ -1687,9 +1707,18 @@
void NEEdgeNonMaxSuppressionKernel::configure(const ITensor *magnitude, const ITensor *phase, ITensor *output,
int32_t upper_thr, int32_t lower_thr, bool border_undefined)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(magnitude, phase, output);
+
+ set_shape_if_empty(*output->info(), magnitude->info()->tensor_shape());
+
+ set_format_if_unknown(*phase->info(), Format::U8);
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(magnitude, phase, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(phase, output);
_magnitude = magnitude;
_phase = phase;
@@ -1765,8 +1794,17 @@
void NEEdgeTraceKernel::configure(ITensor *input, ITensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*input->info(), Format::U8);
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
_input = input;
_output = output;
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index b894683..3147a69 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -51,13 +51,33 @@
void NEChannelCombineKernel::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
ARM_COMPUTE_ERROR_ON(plane0 == output);
ARM_COMPUTE_ERROR_ON(plane1 == output);
ARM_COMPUTE_ERROR_ON(plane2 == output);
+
+ set_format_if_unknown(*plane0->info(), Format::U8);
+ set_format_if_unknown(*plane1->info(), Format::U8);
+ set_format_if_unknown(*plane2->info(), Format::U8);
+
+ if(plane3 != nullptr)
+ {
+ set_format_if_unknown(*plane3->info(), Format::U8);
+ }
+
+ set_shape_if_empty(*output->info(), plane0->info()->tensor_shape());
+
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
+
+ if(plane3 != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, plane3);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane3);
+ }
const Format &output_format = output->info()->format();
@@ -102,6 +122,14 @@
break;
}
+ TensorShape subsampled_shape_plane1{ plane0->info()->tensor_shape() };
+ subsampled_shape_plane1.set(0, subsampled_shape_plane1[0] / _x_subsampling[1]);
+ TensorShape subsampled_shape_plane2{ plane0->info()->tensor_shape() };
+ subsampled_shape_plane2.set(0, subsampled_shape_plane2[0] / _x_subsampling[2]);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane1->info()->tensor_shape(), subsampled_shape_plane1);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane2->info()->tensor_shape(), subsampled_shape_plane2);
+
Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
@@ -134,14 +162,55 @@
void NEChannelCombineKernel::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ set_format_if_unknown(*plane0->info(), Format::U8);
+ set_format_if_unknown(*plane1->info(), Format::U8);
+ set_format_if_unknown(*plane2->info(), Format::U8);
+
+ set_shape_if_empty(*output->plane(0)->info(), plane0->info()->tensor_shape());
+
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ {
+ TensorShape subsampled_shape = plane0->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(1)->info()->tensor_shape(), subsampled_shape);
+
+ if(output->info()->format() == Format::IYUV)
+ {
+ set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(2)->info()->tensor_shape(), subsampled_shape);
+ }
+ break;
+ }
+ case Format::YUV444:
+ set_shape_if_empty(*output->plane(1)->info(), plane0->info()->tensor_shape());
+ set_shape_if_empty(*output->plane(2)->info(), plane0->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane1, plane2, output->plane(1), output->plane(2));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported format");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, output->plane(0));
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
_planes[0] = plane0;
_planes[1] = plane1;
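
The shape checks added here encode the chroma subsampling of the supported multi-planar formats: NV12/NV21/IYUV halve both dimensions of the U/V planes, while YUV444 keeps them at full resolution. A small illustrative helper capturing that rule (not the kernel's own code):

#include <array>
#include <cstddef>

enum class Fmt { NV12, NV21, IYUV, YUV444 };

// Expected chroma-plane dimensions for a given Y-plane size: 2x2 subsampling
// for NV12/NV21/IYUV, full resolution for YUV444.
std::array<std::size_t, 2> chroma_plane_dims(Fmt f, std::size_t width, std::size_t height)
{
    switch(f)
    {
        case Fmt::NV12:
        case Fmt::NV21:
        case Fmt::IYUV:
            return { width / 2, height / 2 };
        case Fmt::YUV444:
        default:
            return { width, height };
    }
}
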
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
index e772fa1..ebc4b85 100644
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -51,23 +51,37 @@
void NEChannelExtractKernel::configure(const ITensor *input, Channel channel, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_ON(input == output);
- ARM_COMPUTE_ERROR_ON(Format::U8 != output->info()->format());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
unsigned int num_elems_processed_per_iteration = 8;
// Check format and channel
- const Format format = input->info()->format();
- const unsigned int subsampling(((Format::YUYV422 == format || Format::UYVY422 == format) && Channel::Y != channel) ? 2 : 1);
+ const Format format = input->info()->format();
+ const unsigned int subsampling = (format == Format::YUYV422 || format == Format::UYVY422) && channel != Channel::Y ? 2 : 1;
+ TensorShape output_shape;
switch(format)
{
case Format::RGB888:
case Format::RGBA8888:
num_elems_processed_per_iteration = 16;
- _func = (Format::RGB888 == format) ? &NEChannelExtractKernel::extract_1C_from_3C_img : &NEChannelExtractKernel::extract_1C_from_4C_img;
+ output_shape = input->info()->tensor_shape();
+
+ if(format == Format::RGB888)
+ {
+ _func = &NEChannelExtractKernel::extract_1C_from_3C_img;
+ }
+ else if(format == Format::RGBA8888)
+ {
+ _func = &NEChannelExtractKernel::extract_1C_from_4C_img;
+ }
+
switch(channel)
{
case Channel::R:
@@ -80,7 +94,7 @@
_lut_index = 2;
break;
case Channel::A:
- if(Format::RGBA8888 == format)
+ if(format == Format::RGBA8888)
{
_lut_index = 3;
_func = &NEChannelExtractKernel::extract_1C_from_4C_img;
@@ -93,6 +107,13 @@
break;
case Format::YUYV422:
case Format::UYVY422:
+ output_shape = input->info()->tensor_shape();
+
+ if(channel != Channel::Y)
+ {
+ output_shape.set(0, output_shape[0] / 2);
+ }
+
switch(channel)
{
case Channel::Y:
@@ -119,6 +140,11 @@
ARM_COMPUTE_ERROR("Not supported format.");
break;
}
+
+ set_shape_if_empty(*output->info(), output_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
_input = input;
_output = output;
@@ -131,16 +157,47 @@
ValidRegion input_valid_region = input->info()->valid_region();
- output_access.set_valid_region(win, ValidRegion(std::move(input_valid_region.anchor), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
INEKernel::configure(win);
}
void NEChannelExtractKernel::configure(const IMultiImage *input, Channel channel, IImage *output)
{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
- ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ switch(channel)
+ {
+ case Channel::Y:
+ set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+ break;
+ case Channel::U:
+ case Channel::V:
+ set_shape_if_empty(*output->info(), input->plane(1)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(1), output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported channel for selected format");
+ }
+ break;
+ case Format::YUV444:
+ set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported format");
+ }
+
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
unsigned int num_elems_processed_per_iteration = 32;
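
The halved output width for non-Y channels of YUYV422/UYVY422 above comes from the packed 4:2:2 layout, where each four-byte group carries two luma samples but only one U and one V. A scalar sketch of the extraction for YUYV (Y0 U Y1 V ordering), purely as a reference:

#include <cstddef>
#include <cstdint>
#include <vector>

// Extract one channel from a packed YUYV422 row (bytes: Y0 U Y1 V, repeated).
// Y comes out at full width, U and V at half width.
std::vector<uint8_t> extract_from_yuyv(const std::vector<uint8_t> &row, char channel)
{
    std::vector<uint8_t> out;
    for(std::size_t i = 0; i + 3 < row.size(); i += 4)
    {
        switch(channel)
        {
            case 'Y':
                out.push_back(row[i]);     // Y0
                out.push_back(row[i + 2]); // Y1
                break;
            case 'U':
                out.push_back(row[i + 1]);
                break;
            case 'V':
                out.push_back(row[i + 3]);
                break;
            default:
                break;
        }
    }
    return out;
}
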
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index 72af075..6d370ac 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -36,43 +36,17 @@
using namespace arm_compute;
-NECol2ImKernel::NECol2ImKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims()
+template <typename T>
+void NECol2ImKernel::run_col2im(const Window &window)
{
-}
-
-void NECol2ImKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- _input = input;
- _output = output;
- _convolved_dims = convolved_dims;
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- INEKernel::configure(win);
-}
-
-void NECol2ImKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
const int output_stride_x = _output->info()->strides_in_bytes().x();
const int output_stride_y = _output->info()->strides_in_bytes().y();
const int output_stride_z = _output->info()->strides_in_bytes().z();
Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, 1, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 1, 0));
- window_out.set(Window::DimZ, Window::Dimension(0, 1, 0));
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
// Create iterators
Iterator in(_input, window);
@@ -83,7 +57,68 @@
const int hidx = id.y();
const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.first) * output_stride_y + (hidx % _convolved_dims.first) * output_stride_x;
- *(reinterpret_cast<float *>(out.ptr() + idx)) = *(reinterpret_cast<const float *>(in.ptr()));
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
},
in, out);
}
+
+NECol2ImKernel::NECol2ImKernel()
+ : _func(), _input(nullptr), _output(nullptr), _convolved_dims()
+{
+}
+
+void NECol2ImKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_data_type_if_unknown(*output->info(), input->info()->data_type());
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, convolved_dims.first);
+ output_shape.set(1, convolved_dims.second);
+ output_shape.set(2, input->info()->tensor_shape()[0]);
+
+ set_shape_if_empty(*output->info(), output_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+ _convolved_dims = convolved_dims;
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ _func = &NECol2ImKernel::run_col2im<uint8_t>;
+ break;
+ case 2:
+ _func = &NECol2ImKernel::run_col2im<uint16_t>;
+ break;
+ case 4:
+ _func = &NECol2ImKernel::run_col2im<uint32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NECol2ImKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ (this->*_func)(window);
+}
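
run_col2im() maps element (x, y) of the column matrix to output position (y % convolved_width, y / convolved_width, x), copying by element size rather than by data type. A scalar reference of that index mapping, with illustrative buffer layouts:

#include <cstddef>
#include <vector>

// Scalar col2im reference: input is laid out with num_kernels as the fastest
// dimension, output with the convolved width as the fastest dimension.
std::vector<float> col2im_reference(const std::vector<float> &in,
                                    std::size_t num_kernels, std::size_t conv_w, std::size_t conv_h)
{
    std::vector<float> out(conv_w * conv_h * num_kernels);
    for(std::size_t y = 0; y < conv_w * conv_h; ++y)
    {
        for(std::size_t x = 0; x < num_kernels; ++x)
        {
            const std::size_t dst = x * conv_w * conv_h + (y / conv_w) * conv_w + (y % conv_w);
            out[dst] = in[y * num_kernels + x];
        }
    }
    return out;
}
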
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
index 110a28e..cb5152e 100644
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -44,8 +44,11 @@
void NEColorConvertKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON(input == nullptr);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
unsigned int num_elems_processed_per_iteration = 0;
@@ -137,9 +140,13 @@
void NEColorConvertKernel::configure(const IMultiImage *input, IImage *output)
{
- ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+
unsigned int num_elems_processed_per_iteration = 0;
switch(input->info()->format())
@@ -241,8 +248,50 @@
void NEColorConvertKernel::configure(const IImage *input, IMultiImage *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ set_shape_if_empty(*output->plane(0)->info(), input->info()->tensor_shape());
+
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ {
+ TensorShape subsampled_shape = input->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ break;
+ }
+ case Format::IYUV:
+ {
+ TensorShape subsampled_shape = input->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+ set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape());
+ break;
+ }
+ case Format::YUV444:
+ set_shape_if_empty(*output->plane(1)->info(), input->info()->tensor_shape());
+ set_shape_if_empty(*output->plane(2)->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(2));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(0));
+
unsigned int num_elems_processed_per_iteration = 0;
switch(input->info()->format())
@@ -372,6 +421,50 @@
void NEColorConvertKernel::configure(const IMultiImage *input, IMultiImage *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON(input == output);
+
+ set_shape_if_empty(*output->plane(0)->info(), input->plane(0)->info()->tensor_shape());
+
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ {
+ TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ break;
+ }
+ case Format::IYUV:
+ {
+ TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+ set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape());
+ break;
+ }
+ case Format::YUV444:
+ set_shape_if_empty(*output->plane(1)->info(), input->plane(0)->info()->tensor_shape());
+ set_shape_if_empty(*output->plane(2)->info(), input->plane(0)->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(2));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(0));
+
switch(input->info()->format())
{
case Format::NV12:
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index b1b26cc..30e91ef 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -40,8 +40,8 @@
#include <cstring>
#include <tuple>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
@@ -323,9 +323,13 @@
template <unsigned int matrix_size>
void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON(conv == nullptr);
_input = input;
_output = output;
@@ -358,7 +362,6 @@
INEKernel::configure(win);
}
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
template <>
template <typename OutputType>
void NEConvolutionKernel<3>::convolution(const Window &win)
@@ -616,7 +619,6 @@
},
input, output);
}
-#endif /* DOXYGEN_SKIP_THIS */
template <unsigned int matrix_size>
void NEConvolutionKernel<matrix_size>::run(const Window &window)
@@ -661,9 +663,13 @@
template <unsigned int matrix_size>
void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
- ARM_COMPUTE_ERROR_ON(conv_row == nullptr);
_input = input;
_output = output;
@@ -709,9 +715,6 @@
}
}
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
-namespace arm_compute
-{
template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
@@ -1076,8 +1079,6 @@
},
input, output);
}
-} // namespace arm_compute
-#endif
template class arm_compute::NESeparableConvolutionHorKernel<5>;
template class arm_compute::NESeparableConvolutionHorKernel<7>;
@@ -1098,9 +1099,13 @@
template <unsigned int matrix_size>
void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON(conv_col == nullptr);
ARM_COMPUTE_ERROR_ON(scale == 0);
_input = input;
@@ -1417,11 +1422,15 @@
void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON(nullptr == conv);
- ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
- ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
+ ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
+ ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
ARM_COMPUTE_ERROR_ON(0 == scale);
_input = input;
@@ -1606,3 +1615,4 @@
},
input, output);
}
+} // namespace arm_compute
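
For reference, the split performed by NESeparableConvolutionHorKernel and NESeparableConvolutionVertKernel amounts to two 1-D passes: rows against conv_row into a wider intermediate, then columns against conv_col with the final division by scale and saturation to the output type. A scalar sketch under simplifying assumptions (equal-length row and column kernels, no border handling):

#include <cstdint>
#include <vector>

std::vector<uint8_t> separable_convolution(const std::vector<uint8_t> &src,
                                           int width, int height,
                                           const std::vector<int16_t> &conv_row,
                                           const std::vector<int16_t> &conv_col,
                                           uint32_t scale)
{
    const int radius = static_cast<int>(conv_row.size()) / 2;

    // Horizontal pass: 8-bit input -> 32-bit intermediate.
    std::vector<int32_t> tmp(src.size(), 0);
    for(int y = 0; y < height; ++y)
    {
        for(int x = radius; x < width - radius; ++x)
        {
            int32_t sum = 0;
            for(int i = -radius; i <= radius; ++i)
            {
                sum += conv_row[i + radius] * src[y * width + x + i];
            }
            tmp[y * width + x] = sum;
        }
    }

    // Vertical pass: intermediate -> scaled, saturated 8-bit output.
    std::vector<uint8_t> dst(src.size(), 0);
    for(int y = radius; y < height - radius; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            int32_t sum = 0;
            for(int i = -radius; i <= radius; ++i)
            {
                sum += conv_col[i + radius] * tmp[(y + i) * width + x];
            }
            sum /= static_cast<int32_t>(scale);
            dst[y * width + x] = static_cast<uint8_t>(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
        }
    }
    return dst;
}
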
diff --git a/src/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.cpp
deleted file mode 100644
index e4ea960..0000000
--- a/src/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-using namespace arm_compute;
-
-NEConvolutionLayerWeightsReshapeKernel::NEConvolutionLayerWeightsReshapeKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr), _has_bias(false)
-{
-}
-
-void NEConvolutionLayerWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- if(bias != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F32);
- }
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
- ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
-
- _input = input;
- _bias = bias;
- _output = output;
- _has_bias = (bias != nullptr);
-
- // Configure kernel
- Window window = calculate_max_window(*input->info(), Steps());
- window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
- window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
- window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
-
- // The NEConvolutionLayerWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- INEKernel::configure(window);
-}
-
-void NEConvolutionLayerWeightsReshapeKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const unsigned int kernel_size = _input->info()->dimension(0);
- const unsigned int kernel_depth = _input->info()->dimension(2);
- const unsigned int input_stride_x = _input->info()->strides_in_bytes().x();
- const unsigned int input_stride_y = _input->info()->strides_in_bytes().y();
- const unsigned int input_stride_z = _input->info()->strides_in_bytes().z();
- const unsigned int output_stride_y = _output->info()->strides_in_bytes().y();
-
- // Create iterators
- Iterator in(_input, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get column index
- const int kernel_idx = id[3];
-
- // Setup pointers
- auto tmp_input_ptr = in.ptr();
- auto tmp_output_ptr = _output->ptr_to_element(Coordinates(kernel_idx, 0));
- auto curr_input_row_ptr = tmp_input_ptr;
- auto curr_input_depth_ptr = tmp_input_ptr;
-
- // Linearize volume
- for(unsigned int d = 0; d < kernel_depth; ++d)
- {
- for(unsigned int j = 0; j < kernel_size; ++j)
- {
- for(unsigned int i = 0; i < kernel_size; ++i)
- {
- *(reinterpret_cast<float *>(tmp_output_ptr)) = *(reinterpret_cast<float *>(tmp_input_ptr));
- tmp_input_ptr += input_stride_x;
- tmp_output_ptr += output_stride_y;
- }
- curr_input_row_ptr += input_stride_y;
- tmp_input_ptr = curr_input_row_ptr;
- }
- curr_input_depth_ptr += input_stride_z;
- curr_input_row_ptr = curr_input_depth_ptr;
- tmp_input_ptr = curr_input_depth_ptr;
- }
-
- // Add bias
- if(_has_bias)
- {
- *(reinterpret_cast<float *>(tmp_output_ptr)) = *(reinterpret_cast<float *>(_bias->ptr_to_element(Coordinates(kernel_idx, 0))));
- }
- },
- in);
-}
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index 0160f1a..32789cb 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -47,6 +47,26 @@
return false;
}
+void NECumulativeDistributionKernel::configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, distribution, cumulative_sum, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+
+ set_format_if_unknown(*input->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON(distribution->num_bins() != cumulative_sum->num_bins());
+ ARM_COMPUTE_ERROR_ON(distribution->num_bins() != output->num_elements());
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() != output->type());
+
+ _input = input;
+ _distribution = distribution;
+ _cumulative_sum = cumulative_sum;
+ _output = output;
+
+ INEKernel::configure(calculate_max_window(*input->info()));
+}
+
void NECumulativeDistributionKernel::run(const Window &window)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -88,19 +108,3 @@
}
}
}
-
-void NECumulativeDistributionKernel::configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output)
-{
- ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(nullptr == distribution);
- ARM_COMPUTE_ERROR_ON(nullptr == cumulative_sum);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
-
- _input = input;
- _distribution = distribution;
- _cumulative_sum = cumulative_sum;
- _output = output;
-
- INEKernel::configure(calculate_max_window(*input->info()));
-}
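
The num_bins checks moved into configure() tie the distribution, the cumulative sum and the LUT to the same bin count. The cumulative part itself is just a running sum over the histogram; a minimal sketch, not the kernel's implementation:

#include <cstddef>
#include <cstdint>
#include <vector>

// Turn a histogram into its cumulative distribution, bin by bin.
std::vector<uint32_t> cumulative_sum(const std::vector<uint32_t> &histogram)
{
    std::vector<uint32_t> cdf(histogram.size(), 0);
    uint32_t running = 0;
    for(std::size_t i = 0; i < histogram.size(); ++i)
    {
        running += histogram[i];
        cdf[i] = running;
    }
    return cdf;
}
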
diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
new file mode 100644
index 0000000..902490e
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+NEDepthConcatenateKernel::NEDepthConcatenateKernel()
+ : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+{
+}
+
+BorderSize NEDepthConcatenateKernel::border_size() const
+{
+ return BorderSize(_top_bottom, _left_right);
+}
+
+void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+ // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+ // Otherwise it is not clear how the padding should be added onto the input tensor
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+ _input = input;
+ _output = output;
+ _depth_offset = depth_offset;
+ _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+ _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_read_per_iteration = 4;
+ const unsigned int num_rows_read_per_iteration = 1;
+
+ // The window needs to be based on input as we copy all the depths of input
+ Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+
+ AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEDepthConcatenateKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ // Offset output
+ const unsigned int offset_to_first_elements_in_bytes = _output->info()->offset_first_element_in_bytes() + _left_right * _output->info()->strides_in_bytes()[0] + _top_bottom *
+ _output->info()->strides_in_bytes()[1] + _depth_offset * _output->info()->strides_in_bytes()[2];
+ uint8_t *output_ptr = _output->buffer() + offset_to_first_elements_in_bytes;
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(output_ptr + output.offset());
+
+ vst1q_f32(out_ptr, vld1q_f32(in_ptr));
+ },
+ input, output);
+}
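
The offset computed at the top of run() places the (possibly smaller) input centred in x/y inside the output and starting at the requested depth. An illustrative reconstruction of that arithmetic, using names matching the kernel members:

#include <cstddef>

// Byte offset of the first output element touched: the input is centred
// spatially (left_right columns, top_bottom rows) and shifted by depth_offset
// planes along z.
std::size_t first_output_offset_bytes(std::size_t in_w, std::size_t in_h,
                                      std::size_t out_w, std::size_t out_h,
                                      std::size_t depth_offset,
                                      std::size_t stride_x, std::size_t stride_y, std::size_t stride_z)
{
    const std::size_t left_right = (out_w - in_w) / 2;
    const std::size_t top_bottom = (out_h - in_h) / 2;
    return left_right * stride_x + top_bottom * stride_y + depth_offset * stride_z;
}
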
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
index eae8641..56612a7 100644
--- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -45,32 +46,27 @@
void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_ERROR_ON(shift >= 8);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32),
+ "Only data_types supported [in] QS8 -> [out] F32");
+
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
- && output->info()->data_type() != DataType::U32
&& output->info()->data_type() != DataType::S32),
- "Only data_types supported [in] U8 -> [out] U16, S16, U32, S32");
+ "Only data_types supported [in] U8 -> [out] U16, S16, S32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
- && output->info()->data_type() != DataType::S32),
- "Only data_types supported [in] U16 -> [out] U8, U32, S32");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
+ "Only data_types supported [in] U16 -> [out] U8, U32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
- && output->info()->data_type() != DataType::S32),
- "Only data_types supported [in] S16 -> [out] U8, U32, S32");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
+ "Only data_types supported [in] S16 -> [out] U8, S32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
- && output->info()->data_type() != DataType::S16),
- "Only data_types supported [in] S16 -> [out] U8, U16, S16");
-
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
- && output->info()->data_type() != DataType::S16),
- "Only data_types supported [in] S16 -> [out] U8, U16, S16");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8),
+ "Only data_types supported [in] F32 -> [out] QS8");
_policy = policy;
_shift = shift;
@@ -92,6 +88,35 @@
switch(_input->info()->data_type())
{
+ case DataType::QS8:
+ {
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ switch(_output->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ /* Up-conversion QS8 -> F32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<const int8_t *>(input.ptr()));
+
+ float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_s8), fixed_point_position);
+ float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_s8), fixed_point_position);
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
case DataType::U8:
{
const int16x8_t b = vdupq_n_s16(_shift);
@@ -121,7 +146,7 @@
}
case DataType::S32:
{
- /* Up-conversion S16 -> S32 */
+ /* Up-conversion U8 -> S32 */
execute_window_loop(window, [&](const Coordinates & id)
{
const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
@@ -214,7 +239,7 @@
}
case DataType::S32:
{
- const int16x8_t b = vdupq_n_s16(_shift);
+ const int32x4_t b = vdupq_n_s32(_shift);
/* Up-conversion S16 -> S32 */
execute_window_loop(window, [&](const Coordinates & id)
@@ -222,15 +247,25 @@
const int16x8x2_t texels =
{
{
- vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
- vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
+ vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8)
}
};
- vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0])));
- vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0])));
- vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1])));
- vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ const int32x4x4_t texels_s32 =
+ {
+ {
+ vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
+ vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
+ vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
+ vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
+ }
+ };
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), texels_s32.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, texels_s32.val[1]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, texels_s32.val[2]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, texels_s32.val[3]);
},
input, output);
break;
@@ -277,7 +312,7 @@
}
};
- vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
+ vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
},
input, output);
}
@@ -311,6 +346,38 @@
}
break;
}
+ case DataType::F32:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::QS8:
+ {
+ const int fixed_point_position = _output->info()->fixed_point_position();
+ /* Down-conversion F32 -> QS8 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x4_t texels_f32 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+ }
+ };
+
+ const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position);
+
+ vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported");
}
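
The new QS8 paths treat QS8 as signed 8-bit fixed point with fixed_point_position fractional bits, so the conversions reduce to a scale by 2^fixed_point_position, with saturation on the way back down. Scalar equivalents as a sketch only; the kernel uses the NEON vcvt helpers, whose exact rounding behaviour may differ:

#include <cmath>
#include <cstdint>

// QS8 -> F32: undo the fixed-point scaling.
float qs8_to_f32(int8_t v, int fixed_point_position)
{
    return static_cast<float>(v) / static_cast<float>(1 << fixed_point_position);
}

// F32 -> QS8: apply the fixed-point scaling and saturate to the int8 range.
int8_t f32_to_qs8(float v, int fixed_point_position)
{
    const float scaled = std::round(v * static_cast<float>(1 << fixed_point_position));
    if(scaled > 127.0f)
    {
        return 127;
    }
    if(scaled < -128.0f)
    {
        return -128;
    }
    return static_cast<int8_t>(scaled);
}
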
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
new file mode 100644
index 0000000..effc50e
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+// Internal load
+inline float32x4_t internal_vld1q(const float *in)
+{
+ return vld1q_f32(in);
+}
+inline qint8x16_t internal_vld1q(const qint8_t *in)
+{
+ return vld1q_qs8(in);
+}
+inline qint16x8_t internal_vld1q(const qint16_t *in)
+{
+ return vld1q_qs16(in);
+}
+
+// Internal store
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+inline void internal_vst1q(qint8_t *p, const qint8x16_t &v)
+{
+ vst1q_qs8(p, v);
+}
+inline void internal_vst1q(qint8_t *p, const qint16x8_t &v)
+{
+ vst1_qs8(p, vqmovn_s16(v));
+}
+inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
+{
+ vst1q_qs16(p, v);
+}
+
+// Internal vdup
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+inline qint8x16_t internal_vdupq_n(qint8_t v)
+{
+ return vdupq_n_qs8(v);
+}
+inline qint16x8_t internal_vdupq_n(qint16_t v)
+{
+ return vdupq_n_qs16(v);
+}
+
+// Internal vadd
+inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
+{
+ return vaddq_f32(x, y);
+}
+inline qint8x16_t internal_vqaddq(const qint8x16_t &x, const qint8x16_t &y)
+{
+ return vqaddq_qs8(x, y);
+}
+inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
+{
+ return vqaddq_qs16(x, y);
+}
+
+template <typename T1, typename T2, bool in_place>
+void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
+{
+ Iterator in(input, window);
+
+ if(in_place) // In place accumulate
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
+ const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
+
+ // Accumulate bias
+ internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
+ },
+ in);
+ }
+ else // Out of place accumulate
+ {
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T1 *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
+ const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
+
+ // Accumulate bias
+ internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
+ },
+ in, out);
+ }
+}
+} // namespace
+
+NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumulateKernel()
+ : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position());
+ if(output != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(bias, output);
+ }
+ ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+
+ _func = nullptr;
+ _bias = bias;
+ _input = input;
+ _output = output;
+
+ const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->info()->data_type());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic bias_access(bias->info(), 0, 0, bias->info()->dimension(0), bias->info()->dimension(1));
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access, bias_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ }
+ else
+ {
+ update_window_and_padding(win, input_access, bias_access);
+ input_access.set_valid_region(win, ValidRegion(Coordinates(), input->info()->tensor_shape()));
+ }
+ INEKernel::configure(win);
+
+ // Set appropriate function
+ if(input->info()->data_type() == DataType::F32)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
+ }
+ else if(input->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
+ }
+ else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ }
+}
+
+void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _bias, window, _output);
+}
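
accumulate_bias<>() reads one bias value per output feature map (indexed by the z coordinate) and adds it to every element of that plane, either in place or into a separate output tensor. A scalar sketch of the F32 case, with both variants collapsed into one:

#include <cstddef>
#include <vector>

// 'planes' holds bias.size() feature maps of plane_size elements each,
// stored contiguously; each plane gets its own bias added to every element.
void accumulate_bias_reference(std::vector<float> &planes, std::size_t plane_size,
                               const std::vector<float> &bias)
{
    for(std::size_t z = 0; z < bias.size(); ++z)
    {
        for(std::size_t i = 0; i < plane_size; ++i)
        {
            planes[z * plane_size + i] += bias[z];
        }
    }
}
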
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
new file mode 100644
index 0000000..d608898
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -0,0 +1,817 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+template <unsigned int stridex>
+float32x4_t internal_vld1q(const float *in);
+
+template <>
+float32x4_t internal_vld1q<1>(const float *in)
+{
+ return vld1q_f32(in);
+}
+
+template <>
+float32x4_t internal_vld1q<2>(const float *in)
+{
+ const float32x4x2_t tmp = vld2q_f32(in);
+ return tmp.val[0];
+}
+
+template <>
+float32x4_t internal_vld1q<3>(const float *in)
+{
+ const float32x4x3_t tmp = vld3q_f32(in);
+ return tmp.val[0];
+}
+
+template <unsigned int stridex>
+qint8x8_t internal_vld1q(const qint8_t *in);
+
+template <>
+qint8x8_t internal_vld1q<1>(const qint8_t *in)
+{
+ return vld1_qs8(in);
+}
+
+template <>
+qint8x8_t internal_vld1q<2>(const qint8_t *in)
+{
+ const qint8x8x2_t tmp = vld2_s8(in);
+ return tmp.val[0];
+}
+
+template <>
+qint8x8_t internal_vld1q<3>(const qint8_t *in)
+{
+ const qint8x8x3_t tmp = vld3_s8(in);
+ return tmp.val[0];
+}
+
+template <unsigned int stridex>
+qint16x8_t internal_vld1q(const qint16_t *in);
+
+template <>
+qint16x8_t internal_vld1q<1>(const qint16_t *in)
+{
+ return vld1q_s16(in);
+}
+
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+
+inline qint8x8_t internal_vdupq_n(qint8_t v)
+{
+ return vdup_n_qs8(v);
+}
+
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+
+inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
+{
+ vst1q_qs16(p, v);
+}
+
+float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmulq_f32(x, y);
+}
+
+qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
+{
+ return vmull_qs8(x, y, fixed_point_position);
+}
+
+inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmlaq_f32(x, y, z);
+}
+
+inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
+{
+ return vqmlal_qs8(x, y, z, fixed_point_position);
+}
+
+template <typename T1, typename T2, unsigned int stridex>
+class convolver_1x1
+{
+public:
+ static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int range_z = window.z().end() - window.z().start();
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // setup output window for the iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
+
+ // setup input window for the iterator
+ Window window_in = window;
+ // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+ Iterator out(output, window_out);
+ Iterator in(input, window_in);
+ Iterator k(weights, window_k);
+
+ const uint8_t *k_ptr = k.ptr();
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ /*
+ For a detailed explanation on how the algorithm works refer to template <> class convolver_3x3<1>
+ */
+ const uint8_t *input_ptr = in.ptr();
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+ for(int oz = 0; oz < range_z; ++oz)
+ {
+ auto p_out_base = out_ptr + oz * output_stride_z;
+ // Step 1
+ {
+ const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
+ const auto vk = internal_vdupq_n(*k_val);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ const int offset_xy = ih * input_stride_y;
+ auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
+ {
+ internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+ }
+ }
+ }
+ // Step 2
+ for(int p = 1; p < kernel_depth; ++p)
+ {
+ const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
+ const auto vk = internal_vdupq_n(*k_val);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ const int offset_xy = ih * input_stride_y;
+ auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
+ {
+ internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+ }
+ }
+ }
+ }
+ },
+ in, out);
+ }
+};
+
+inline float32x4x3_t load_matrix_row(const float *ptr)
+{
+ const float32x4x3_t r =
+ {
+ {
+ vld1q_dup_f32(ptr),
+ vld1q_dup_f32(1 + ptr),
+ vld1q_dup_f32(2 + ptr)
+ }
+ };
+ return r;
+}
+inline qint8x8x3_t load_matrix_row(const qint8_t *ptr)
+{
+ /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
+ r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+ const qint8x8x3_t r =
+ {
+ {
+ vld1_dup_qs8(ptr),
+ vld1_dup_qs8(1 + ptr),
+ vld1_dup_qs8(2 + ptr)
+ }
+ };
+ return r;
+}
+
+template <unsigned int stridex>
+float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
+
+template <>
+inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
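+    // Load 12 consecutive input values per row; vextq_f32 builds the windows shifted by one and two elements for the
+    // second and third kernel columns, and the 8 results are accumulated into out.val[0] and out.val[1]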
+ const float32x4x3_t vtop =
+ {
+ {
+ vld1q_f32(in_top),
+ vld1q_f32(in_top + 4),
+ vld1q_f32(in_top + 8)
+ }
+ };
+ const float32x4x3_t vmid =
+ {
+ {
+ vld1q_f32(in_mid),
+ vld1q_f32(in_mid + 4),
+ vld1q_f32(in_mid + 8)
+ }
+ };
+ const float32x4x3_t vlow =
+ {
+ {
+ vld1q_f32(in_low),
+ vld1q_f32(in_low + 4),
+ vld1q_f32(in_low + 8)
+ }
+ };
+ float32x4x2_t out =
+ {
+ {
+ vmulq_f32(vtop.val[0], m0.val[0]),
+ vmulq_f32(vtop.val[1], m0.val[0])
+ }
+ };
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
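+    // Compute the stride-1 result, then pack the even-indexed outputs (0, 2, 4, 6) into the low four lanes of val[0]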
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
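+    // Compute the stride-1 result, then pack outputs 0 and 3 into the two lowest lanes of val[0]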
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+template <unsigned int stridex>
+qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position);
+
+template <>
+inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
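+    // Load 24 consecutive fixed-point input values per row (three 8-element vectors); vext_s8 builds the shifted windows for the
+    // second and third kernel columns, and the 16 results are accumulated into out.val[0] and out.val[1] as qint16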
+ const qint8x8x3_t vtop =
+ {
+ {
+ vld1_qs8(in_top),
+ vld1_qs8(in_top + 8),
+ vld1_qs8(in_top + 16)
+ }
+ };
+ const qint8x8x3_t vmid =
+ {
+ {
+ vld1_qs8(in_mid),
+ vld1_qs8(in_mid + 8),
+ vld1_qs8(in_mid + 16)
+ }
+ };
+ const qint8x8x3_t vlow =
+ {
+ {
+ vld1_qs8(in_low),
+ vld1_qs8(in_low + 8),
+ vld1_qs8(in_low + 16)
+ }
+ };
+ qint16x8x2_t out =
+ {
+ {
+ vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position),
+ vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position)
+ }
+ };
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position);
+ return out;
+}
+
+template <>
+inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
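+    // Compute the stride-1 result, then pack the even-indexed outputs (0, 2, ..., 14) into the eight lanes of val[0]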
+ qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7);
+ return out;
+}
+
+template <>
+inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3);
+ return out;
+}
+
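+// Store the convolve_3x3 results that are valid for the given stride: larger strides keep fewer output values per iteration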
+template <unsigned int stridex>
+void store_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+void store_results<1>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, values.val[0]);
+ vst1q_f32(buffer + 4, values.val[1]);
+}
+
+template <>
+void store_results<2>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(float *buffer, const float32x4x2_t &values)
+{
+ vst1_f32(buffer, vget_low_f32(values.val[0]));
+}
+
+template <unsigned int stridex>
+void store_results(qint16_t *buffer, const qint16x8x2_t &values);
+
+template <>
+void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, values.val[0]);
+ vst1q_qs16(buffer + 8, values.val[1]);
+}
+
+template <>
+void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1_qs16(buffer, vget_low_s16(values.val[0]));
+}
+
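+// Same as store_results, but accumulates into the values already present in the output (used for kernel planes p >= 1 in Step 2)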
+template <unsigned int stridex>
+void accumulate_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+void accumulate_results<1>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
+ vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
+{
+ vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
+}
+
+template <unsigned int stridex>
+void accumulate_results(qint16_t *buffer, const qint16x8x2_t &values);
+
+template <>
+void accumulate_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
+ vst1q_qs16(buffer + 8, vqaddq_qs16(vld1q_qs16(buffer + 8), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
+}
+
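+// Number of input elements consumed per iteration: the input pointers advance stridex elements for every output element written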
+template <unsigned int stridex>
+int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
+
+template <>
+int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration;
+}
+
+template <>
+int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration << 1;
+}
+
+template <>
+int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration * 3;
+}
+
+template <typename T1, typename T2, unsigned int stridex>
+class convolver_3x3
+{
+public:
+ static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
+ const int input_stride_x = input->info()->strides_in_bytes().x();
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_x = weights->info()->strides_in_bytes().x();
+ const int kernel_stride_y = weights->info()->strides_in_bytes().y();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int num_planes_z = window.z().end() - window.z().start();
+ const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
+ const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // setup output window for the iterator
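+        // X, Y and Z collapse to a single step each, so the lambda below runs once over the Z range assigned to this thread and walks the output plane itself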
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
+
+ // setup input window for the iterator
+ Window window_in = window;
+ // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+ Iterator out(output, window_out);
+ Iterator in(input, window_in);
+ Iterator k(weights, window_k);
+
+ const uint8_t *k_ptr = k.ptr();
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+            /*
+            Each thread executing this kernel computes one or more planes of the output volume.
+
+            Let's say the 3rd dimension of the output volume is 32: the first thread computes the output for Z = [0,7], the second thread for Z = [8,15],
+            the third thread for Z = [16,23] and the fourth thread for Z = [24,31].
+
+            The algorithm's outer loops iterate over Z, P, Y, X, where P is the depth/3rd dimension of each kernel. This order is not arbitrary: its main benefit
+            is that the NEON registers holding the kernel's values are set up only once, and every XY position is then computed with the preloaded registers, as opposed to reloading them for every XY value.
+
+            The algorithm does not require allocating any additional memory and computes the results directly in place in two stages:
+            1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
+            2) Convolve the remaining planes and accumulate the results in the output plane initialized in step 1.
+            */
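+            // For example, with F32 data and stride 1 the inner loops below write 8 output values per iteration
+            // (num_elems_written_per_iteration = 16 >> 1) and read 12 consecutive input values per row (see configure())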
+
+ for(int oz = 0; oz < num_planes_z; ++oz)
+ {
+ uint8_t *p_out_base = out_ptr + oz * output_stride_z;
+ // Step 1
+ {
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto vk_r0 = load_matrix_row(ptr_k_r0);
+ const auto vk_r1 = load_matrix_row(ptr_k_r1);
+ const auto vk_r2 = load_matrix_row(ptr_k_r2);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
+ auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
+ auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+ in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+ {
+ auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+ store_results<stridex>(p_out, vres);
+ }
+ }
+ }
+ // Step 2
+ for(int p = 1; p < kernel_depth; ++p)
+ {
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto vk_r0 = load_matrix_row(ptr_k_r0);
+ const auto vk_r1 = load_matrix_row(ptr_k_r1);
+ const auto vk_r2 = load_matrix_row(ptr_k_r2);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ auto in_top = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
+ auto in_mid = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
+ auto in_low = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+ in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+ {
+ auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+ accumulate_results<stridex>(p_out, vres);
+ }
+ }
+ }
+ }
+ },
+ in, out);
+ }
+};
+
+template <typename T1, typename T2>
+inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
+template <typename T1, typename T2>
+inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace
+
+NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
+ : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_elems_read_per_iteration(0), _num_elems_written_per_iteration(0)
+{
+}
+
+BorderSize NEDirectConvolutionLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
+ "Pad > 0 not supported for 1x1 weights");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
+ "Pad > 1 not supported for 3x3 weights");
+ ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
+
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
+ const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
+
+ _input = input;
+ _weights = weights;
+ _output = output;
+ _conv_info = conv_info;
+ _kernel_size = weights->info()->dimension(0);
+ _border_size = BorderSize(conv_pad_y, conv_pad_x);
+
+ Window win = calculate_max_window(*output->info());
+
+ switch(_kernel_size)
+ {
+ case 1:
+ {
+ _num_elems_written_per_iteration = (input->info()->data_type() == DataType::QS8) ? 8 : 4;
+ _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration;
+
+ win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, _num_elems_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ break;
+ }
+ case 3:
+ {
+ if(input->info()->data_type() == DataType::F32)
+ {
+ _num_elems_read_per_iteration = 12;
+ _num_elems_written_per_iteration = 16 >> conv_stride_x;
+ }
+ else
+ {
+ _num_elems_read_per_iteration = 24;
+ _num_elems_written_per_iteration = 32 >> conv_stride_x;
+ }
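+            // e.g. F32 with stride 1 reads 12 input values per iteration and writes 16 >> 1 = 8 output values (see convolve_3x3<1>)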
+
+ // Calculate right and bottom border
+ const unsigned int conv_stride_y = std::get<1>(_conv_info.stride());
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
+ const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width;
+ const int upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height;
+ _border_size.right = std::max(upper_bound_w, static_cast<int>(_kernel_size));
+ _border_size.bottom = std::max(upper_bound_h, static_cast<int>(_kernel_size));
+
+ // Create window and update padding
+ win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+ AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, _kernel_size, _kernel_size);
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
+ update_window_and_padding(win, input_access, weights_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ break;
+ }
+ }
+
+ INEKernel::configure(win);
+}
+
+void NEDirectConvolutionLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ const int kernel_size = _weights->info()->dimension(0);
+
+ switch(kernel_size)
+ {
+ case 1:
+ {
+ if(_input->info()->data_type() == DataType::QS8)
+ {
+ convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ else
+ {
+ convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ break;
+ }
+ case 3:
+ {
+ if(_input->info()->data_type() == DataType::QS8)
+ {
+ convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ else
+ {
+ convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Only kernel sizes 1x1 and 3x3 are supported.");
+ break;
+ }
+ }
+}
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 5e03c32..bd99242 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -47,13 +47,15 @@
void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
_tensor = tensor;
_border_size = border_size;
_mode = border_mode;
_constant_border_value = constant_border_value;
+ _border_size.limit(tensor->info()->padding());
+
Window win;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -81,10 +83,15 @@
case DataType::U8:
fill_constant_value_single_channel<uint8_t>(window);
break;
+ case DataType::QS8:
+ case DataType::S8:
+ fill_constant_value_single_channel<int8_t>(window);
+ break;
case DataType::U16:
fill_constant_value_single_channel<uint16_t>(window);
break;
case DataType::S16:
+ case DataType::QS16:
fill_constant_value_single_channel<int16_t>(window);
break;
case DataType::U32:
@@ -109,10 +116,15 @@
case DataType::U8:
fill_replicate_single_channel<uint8_t>(window);
break;
+ case DataType::QS8:
+ case DataType::S8:
+ fill_replicate_single_channel<int8_t>(window);
+ break;
case DataType::U16:
fill_replicate_single_channel<uint16_t>(window);
break;
case DataType::S16:
+ case DataType::QS16:
fill_replicate_single_channel<int16_t>(window);
break;
case DataType::U32:
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 9012060..3ff8b7b 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -132,8 +132,8 @@
void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(1) / 4.0f));
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index daeeee0..3558c68 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -35,7 +35,6 @@
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
-#include <iostream>
#include <tuple>
using namespace arm_compute;
@@ -67,7 +66,7 @@
_output_mult_int = output_mult_int;
_shift = shift;
- constexpr unsigned int num_elems_processed_per_iteration_x = 4;
+ constexpr unsigned int num_elems_processed_per_iteration_x = 16;
constexpr unsigned int num_elems_processed_per_iteration_y = 4;
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
@@ -95,9 +94,9 @@
win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
win_a.set(Window::DimY, Window::Dimension(window.y().start() >> 2, window.y().end() >> 2, 1));
- /* Set step_x and step_y for matrix B. Scale by a factor of 4 the X range as the input transposed matrix A has 4 times less the cols of the output matrix */
+    /* Set step_x and step_y for matrix B. Scale the X range by a factor of 16, as the transposed input matrix has 16 times fewer columns than the output matrix */
Window win_b(window);
- win_b.set(Window::DimX, Window::Dimension(window.x().start() >> 2, window.x().end() >> 2, in_b_stride));
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() >> 4, window.x().end() >> 4, in_b_stride));
win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
/* The step x and step y for the output matrix has been already set using in configure() */
@@ -105,113 +104,320 @@
Iterator inb(_input1, win_b);
Iterator out(_output, window);
- const int32x4_t voffset_a = vdupq_n_s32(_a_offset);
- const int32x4_t voffset_b = vdupq_n_s32(_b_offset);
- const int32x4_t voffset_out = vdupq_n_s32(_output_offset);
- const int32x4_t vshiftr = vdupq_n_s32(-_shift);
+ const int32x4_t voffset_a = vdupq_n_s32(_a_offset);
+ const int32x4_t voffset_b = vdupq_n_s32(_b_offset);
+ const int32x4_t vshiftr = vdupq_n_s32(-_shift);
- const int width_b = _input1->info()->dimension(0);
- const int max_it_16 = width_b - 16;
+ const int width_b = _input1->info()->dimension(0);
+    // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
+    // The reshaping makes the implementation cache friendly and avoids the data re-arrangements needed to compute 16x4 elements per iteration
+    // All the values needed to compute a single 4x4 block are read from consecutive memory positions
execute_window_loop(window, [&](const Coordinates &)
{
- auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
- auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
+ const uint8_t *mtx_a0 = ina.ptr();
+ const uint8_t *mtx_b0 = inb.ptr();
- int32x4x4_t c =
+        // Accumulators for block 0
+ int32x4x4_t c0 =
{
{
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
}
};
- int k = 0;
- // if max_it_16 < 0 we skip the for block and fall back to process just 4 elements
- for(; k <= max_it_16; k += 16, mtx_a0 += 16, mtx_b0 += 16)
+
+        // Accumulators for block 1
+ int32x4x4_t c1 =
{
- const int8x16_t p00 = vld1q_s8(mtx_a0);
- const int8x16_t q00 = vld1q_s8(mtx_b0);
- const int32x4x4_t ia0 =
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+        // Accumulators for block 2
+ int32x4x4_t c2 =
+ {
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+        // Accumulators for block 3
+ int32x4x4_t c3 =
+ {
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+ int k = 0;
+ // This for loop performs 4 accumulations per iteration
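+            // Each iteration consumes 16 bytes of the reshaped matrix A and 64 bytes of the reshaped matrix B, i.e. 4 accumulation steps for the whole 4x16 output block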
+ for(; k <= (width_b - 64); k += 64, mtx_a0 += 16, mtx_b0 += 64)
+ {
+ const uint8x8_t p00 = vld1_u8(mtx_a0 + 0);
+ const uint8x8_t p01 = vld1_u8(mtx_a0 + 8);
+ const uint8x8_t q00l = vld1_u8(mtx_b0 + 0);
+ const uint8x8_t q00h = vld1_u8(mtx_b0 + 8);
+ const uint8x8_t q01l = vld1_u8(mtx_b0 + 16);
+ const uint8x8_t q01h = vld1_u8(mtx_b0 + 24);
+ const uint8x8_t q02l = vld1_u8(mtx_b0 + 32);
+ const uint8x8_t q02h = vld1_u8(mtx_b0 + 40);
+ const uint8x8_t q03l = vld1_u8(mtx_b0 + 48);
+ const uint8x8_t q03h = vld1_u8(mtx_b0 + 56);
+
+ const int32x4_t ia0l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00))));
+ const int32x4_t ia0h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p00))));
+ const int32x4_t ia1l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p01))));
+ const int32x4_t ia1h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p01))));
+
+ const int32x2x4_t ia0 =
{
{
- vaddw_s16(voffset_a, vget_low_s16(vmovl_s8(vget_low_s8(p00)))),
- vaddw_s16(voffset_a, vget_high_s16(vmovl_s8(vget_low_s8(p00)))),
- vaddw_s16(voffset_a, vget_low_s16(vmovl_s8(vget_high_s8(p00)))),
- vaddw_s16(voffset_a, vget_high_s16(vmovl_s8(vget_high_s8(p00))))
+ vget_low_s32(ia0l),
+ vget_high_s32(ia0l),
+ vget_low_s32(ia0h),
+ vget_high_s32(ia0h)
}
};
+
+ const int32x2x4_t ia1 =
+ {
+ {
+ vget_low_s32(ia1l),
+ vget_high_s32(ia1l),
+ vget_low_s32(ia1h),
+ vget_high_s32(ia1h)
+ }
+ };
+
const int32x4x4_t ib0 =
{
{
- vaddw_s16(voffset_b, vget_low_s16(vmovl_s8(vget_low_s8(q00)))),
- vaddw_s16(voffset_b, vget_high_s16(vmovl_s8(vget_low_s8(q00)))),
- vaddw_s16(voffset_b, vget_low_s16(vmovl_s8(vget_high_s8(q00)))),
- vaddw_s16(voffset_b, vget_high_s16(vmovl_s8(vget_high_s8(q00))))
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h))))
}
};
- /* Accumulation 0 */
- c.val[0] = vmlaq_lane_s32(c.val[0], ib0.val[0], vget_low_s32(ia0.val[0]), 0);
- c.val[1] = vmlaq_lane_s32(c.val[1], ib0.val[0], vget_low_s32(ia0.val[0]), 1);
- c.val[2] = vmlaq_lane_s32(c.val[2], ib0.val[0], vget_high_s32(ia0.val[0]), 0);
- c.val[3] = vmlaq_lane_s32(c.val[3], ib0.val[0], vget_high_s32(ia0.val[0]), 1);
- /* Accumulation 1 */
- c.val[0] = vmlaq_lane_s32(c.val[0], ib0.val[1], vget_low_s32(ia0.val[1]), 0);
- c.val[1] = vmlaq_lane_s32(c.val[1], ib0.val[1], vget_low_s32(ia0.val[1]), 1);
- c.val[2] = vmlaq_lane_s32(c.val[2], ib0.val[1], vget_high_s32(ia0.val[1]), 0);
- c.val[3] = vmlaq_lane_s32(c.val[3], ib0.val[1], vget_high_s32(ia0.val[1]), 1);
- /* Accumulation 2 */
- c.val[0] = vmlaq_lane_s32(c.val[0], ib0.val[2], vget_low_s32(ia0.val[2]), 0);
- c.val[1] = vmlaq_lane_s32(c.val[1], ib0.val[2], vget_low_s32(ia0.val[2]), 1);
- c.val[2] = vmlaq_lane_s32(c.val[2], ib0.val[2], vget_high_s32(ia0.val[2]), 0);
- c.val[3] = vmlaq_lane_s32(c.val[3], ib0.val[2], vget_high_s32(ia0.val[2]), 1);
- /* Accumulation 3 */
- c.val[0] = vmlaq_lane_s32(c.val[0], ib0.val[3], vget_low_s32(ia0.val[3]), 0);
- c.val[1] = vmlaq_lane_s32(c.val[1], ib0.val[3], vget_low_s32(ia0.val[3]), 1);
- c.val[2] = vmlaq_lane_s32(c.val[2], ib0.val[3], vget_high_s32(ia0.val[3]), 0);
- c.val[3] = vmlaq_lane_s32(c.val[3], ib0.val[3], vget_high_s32(ia0.val[3]), 1);
- }
- for(; k < width_b; k += 4, mtx_a0 += 4, mtx_b0 += 4)
- {
- const int8x8_t p00 = vld1_s8(mtx_a0);
- const int8x8_t q00 = vld1_s8(mtx_b0);
- const int32x4_t ia0 = vaddw_s16(voffset_a, vget_low_s16(vmovl_s8(p00)));
- const int32x4_t ib0 = vaddw_s16(voffset_b, vget_low_s16(vmovl_s8(q00)));
- c.val[0] = vmlaq_lane_s32(c.val[0], ib0, vget_low_s32(ia0), 0);
- c.val[1] = vmlaq_lane_s32(c.val[1], ib0, vget_low_s32(ia0), 1);
- c.val[2] = vmlaq_lane_s32(c.val[2], ib0, vget_high_s32(ia0), 0);
- c.val[3] = vmlaq_lane_s32(c.val[3], ib0, vget_high_s32(ia0), 1);
+ const int32x4x4_t ib1 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01h))))
+ }
+ };
+
+ const int32x4x4_t ib2 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02h))))
+ }
+ };
+
+ const int32x4x4_t ib3 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03h))))
+ }
+ };
+
+ // 4x4 block 0 - Accumulation 0
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia0.val[0], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia0.val[0], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia0.val[1], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia0.val[1], 1);
+ // 4x4 block 0 - Accumulation 1
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib1.val[0], ia0.val[2], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib1.val[0], ia0.val[2], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib1.val[0], ia0.val[3], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib1.val[0], ia0.val[3], 1);
+ // 4x4 block 0 - Accumulation 2
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib2.val[0], ia1.val[0], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib2.val[0], ia1.val[0], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib2.val[0], ia1.val[1], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib2.val[0], ia1.val[1], 1);
+ // 4x4 block 0 - Accumulation 3
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib3.val[0], ia1.val[2], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib3.val[0], ia1.val[2], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib3.val[0], ia1.val[3], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib3.val[0], ia1.val[3], 1);
+
+ // 4x4 block 1 - Accumulation 0
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia0.val[0], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia0.val[0], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia0.val[1], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia0.val[1], 1);
+ // 4x4 block 1 - Accumulation 1
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib1.val[1], ia0.val[2], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib1.val[1], ia0.val[2], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib1.val[1], ia0.val[3], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib1.val[1], ia0.val[3], 1);
+ // 4x4 block 1 - Accumulation 2
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib2.val[1], ia1.val[0], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib2.val[1], ia1.val[0], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib2.val[1], ia1.val[1], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib2.val[1], ia1.val[1], 1);
+ // 4x4 block 1 - Accumulation 3
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib3.val[1], ia1.val[2], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib3.val[1], ia1.val[2], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib3.val[1], ia1.val[3], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib3.val[1], ia1.val[3], 1);
+
+ // 4x4 block 2 - Accumulation 0
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia0.val[0], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia0.val[0], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia0.val[1], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia0.val[1], 1);
+ // 4x4 block 2 - Accumulation 1
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib1.val[2], ia0.val[2], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib1.val[2], ia0.val[2], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib1.val[2], ia0.val[3], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib1.val[2], ia0.val[3], 1);
+ // 4x4 block 2 - Accumulation 2
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib2.val[2], ia1.val[0], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib2.val[2], ia1.val[0], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib2.val[2], ia1.val[1], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib2.val[2], ia1.val[1], 1);
+ // 4x4 block 2 - Accumulation 3
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib3.val[2], ia1.val[2], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib3.val[2], ia1.val[2], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib3.val[2], ia1.val[3], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib3.val[2], ia1.val[3], 1);
+
+ // 4x4 block 3 - Accumulation 0
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia0.val[0], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia0.val[0], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia0.val[1], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia0.val[1], 1);
+ // 4x4 block 3 - Accumulation 1
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib1.val[3], ia0.val[2], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib1.val[3], ia0.val[2], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib1.val[3], ia0.val[3], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib1.val[3], ia0.val[3], 1);
+ // 4x4 block 3 - Accumulation 2
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib2.val[3], ia1.val[0], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib2.val[3], ia1.val[0], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib2.val[3], ia1.val[1], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib2.val[3], ia1.val[1], 1);
+ // 4x4 block 3 - Accumulation 3
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib3.val[3], ia1.val[2], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib3.val[3], ia1.val[2], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib3.val[3], ia1.val[3], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib3.val[3], ia1.val[3], 1);
}
- c.val[0] = vshlq_s32(vmulq_n_s32(vaddq_s32(voffset_out, c.val[0]), _output_mult_int), vshiftr);
- c.val[1] = vshlq_s32(vmulq_n_s32(vaddq_s32(voffset_out, c.val[1]), _output_mult_int), vshiftr);
- c.val[2] = vshlq_s32(vmulq_n_s32(vaddq_s32(voffset_out, c.val[2]), _output_mult_int), vshiftr);
- c.val[3] = vshlq_s32(vmulq_n_s32(vaddq_s32(voffset_out, c.val[3]), _output_mult_int), vshiftr);
- const uint8x8x2_t r =
+
+ // This for loop handles the left-over accumulations
+ for(; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+ {
+ const uint8x8_t p00 = vld1_u8(mtx_a0);
+ const uint8x8_t q00l = vld1_u8(mtx_b0);
+ const uint8x8_t q00h = vld1_u8(mtx_b0 + 8);
+
+ const int32x4_t ia0 = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00))));
+
+ const int32x2x2_t ia =
+ {
+ {
+ vget_low_s32(ia0),
+ vget_high_s32(ia0)
+ }
+ };
+
+ const int32x4x4_t ib0 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h))))
+ }
+ };
+
+ // 4x4 block 0
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia.val[0], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia.val[0], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia.val[1], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia.val[1], 1);
+
+ // 4x4 block 1
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia.val[0], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia.val[0], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia.val[1], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia.val[1], 1);
+
+ // 4x4 block 2
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia.val[0], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia.val[0], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia.val[1], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia.val[1], 1);
+
+ // 4x4 block 3
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia.val[0], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia.val[0], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia.val[1], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia.val[1], 1);
+ }
+
+ c0.val[0] = vshlq_s32(vmulq_n_s32(c0.val[0], _output_mult_int), vshiftr);
+ c0.val[1] = vshlq_s32(vmulq_n_s32(c0.val[1], _output_mult_int), vshiftr);
+ c0.val[2] = vshlq_s32(vmulq_n_s32(c0.val[2], _output_mult_int), vshiftr);
+ c0.val[3] = vshlq_s32(vmulq_n_s32(c0.val[3], _output_mult_int), vshiftr);
+
+ c1.val[0] = vshlq_s32(vmulq_n_s32(c1.val[0], _output_mult_int), vshiftr);
+ c1.val[1] = vshlq_s32(vmulq_n_s32(c1.val[1], _output_mult_int), vshiftr);
+ c1.val[2] = vshlq_s32(vmulq_n_s32(c1.val[2], _output_mult_int), vshiftr);
+ c1.val[3] = vshlq_s32(vmulq_n_s32(c1.val[3], _output_mult_int), vshiftr);
+
+ c2.val[0] = vshlq_s32(vmulq_n_s32(c2.val[0], _output_mult_int), vshiftr);
+ c2.val[1] = vshlq_s32(vmulq_n_s32(c2.val[1], _output_mult_int), vshiftr);
+ c2.val[2] = vshlq_s32(vmulq_n_s32(c2.val[2], _output_mult_int), vshiftr);
+ c2.val[3] = vshlq_s32(vmulq_n_s32(c2.val[3], _output_mult_int), vshiftr);
+
+ c3.val[0] = vshlq_s32(vmulq_n_s32(c3.val[0], _output_mult_int), vshiftr);
+ c3.val[1] = vshlq_s32(vmulq_n_s32(c3.val[1], _output_mult_int), vshiftr);
+ c3.val[2] = vshlq_s32(vmulq_n_s32(c3.val[2], _output_mult_int), vshiftr);
+ c3.val[3] = vshlq_s32(vmulq_n_s32(c3.val[3], _output_mult_int), vshiftr);
+
+ const uint8x16x4_t r =
{
{
- vqmovun_s16(vcombine_s16(vqmovn_s32(c.val[0]), vqmovn_s32(c.val[1]))),
- vqmovun_s16(vcombine_s16(vqmovn_s32(c.val[2]), vqmovn_s32(c.val[3])))
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[0]), vqmovn_s32(c1.val[0]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[0]), vqmovn_s32(c3.val[0])))),
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[1]), vqmovn_s32(c1.val[1]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[1]), vqmovn_s32(c3.val[1])))),
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[2]), vqmovn_s32(c1.val[2]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[2]), vqmovn_s32(c3.val[2])))),
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[3]), vqmovn_s32(c1.val[3]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[3]), vqmovn_s32(c3.val[3]))))
}
};
- const auto mtx_out = reinterpret_cast<uint8_t *>(out.ptr());
- vst1_lane_u8(mtx_out + 0 * out_stride + 0, r.val[0], 0);
- vst1_lane_u8(mtx_out + 0 * out_stride + 1, r.val[0], 1);
- vst1_lane_u8(mtx_out + 0 * out_stride + 2, r.val[0], 2);
- vst1_lane_u8(mtx_out + 0 * out_stride + 3, r.val[0], 3);
- vst1_lane_u8(mtx_out + 1 * out_stride + 0, r.val[0], 4);
- vst1_lane_u8(mtx_out + 1 * out_stride + 1, r.val[0], 5);
- vst1_lane_u8(mtx_out + 1 * out_stride + 2, r.val[0], 6);
- vst1_lane_u8(mtx_out + 1 * out_stride + 3, r.val[0], 7);
- vst1_lane_u8(mtx_out + 2 * out_stride + 0, r.val[1], 0);
- vst1_lane_u8(mtx_out + 2 * out_stride + 1, r.val[1], 1);
- vst1_lane_u8(mtx_out + 2 * out_stride + 2, r.val[1], 2);
- vst1_lane_u8(mtx_out + 2 * out_stride + 3, r.val[1], 3);
- vst1_lane_u8(mtx_out + 3 * out_stride + 0, r.val[1], 4);
- vst1_lane_u8(mtx_out + 3 * out_stride + 1, r.val[1], 5);
- vst1_lane_u8(mtx_out + 3 * out_stride + 2, r.val[1], 6);
- vst1_lane_u8(mtx_out + 3 * out_stride + 3, r.val[1], 7);
+
+ uint8_t *const mtx_out = out.ptr();
+ vst1q_u8(mtx_out + 0 * out_stride, r.val[0]);
+ vst1q_u8(mtx_out + 1 * out_stride, r.val[1]);
+ vst1q_u8(mtx_out + 2 * out_stride, r.val[2]);
+ vst1q_u8(mtx_out + 3 * out_stride, r.val[3]);
},
ina, inb, out);
}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index 70018ec..7a3bae5 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
@@ -44,26 +45,31 @@
void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
_biases = biases;
_accum = accum;
- constexpr unsigned int num_elems_processed_per_iteration = 4;
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
// Configure kernel window
Window win = calculate_max_window(*accum->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowStatic output_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
+ AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
update_window_and_padding(win,
AccessWindowHorizontal(accum->info(), 0, num_elems_processed_per_iteration),
- output_access);
+ biases_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), accum->info()->tensor_shape()));
+ AccessWindowHorizontal output_access(accum->info(), 0, num_elems_processed_per_iteration);
+
+ // Set the valid region for the accum tensor
+ Coordinates coord;
+ coord.set_num_dimensions(accum->info()->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, accum->info()->tensor_shape()));
INEKernel::configure(win);
}
@@ -80,12 +86,43 @@
Iterator in0_out(_accum, window);
Iterator in1(_biases, win_biases);
- execute_window_loop(window, [&](const Coordinates & id)
+ switch(_accum->info()->data_type())
{
- const float32x4_t accum = vld1q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
- const float32x4_t biases = vld1q_f32(reinterpret_cast<const float *>(in1.ptr()));
+ case DataType::F32:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
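+                // Processes 16 floats at a time; the vld4q/vst4q de-interleave and re-interleave cancel out, so this is a plain element-wise addition of accum and biases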
+ const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
+ const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr()));
+ const float32x4x4_t res =
+ {
+ {
+ vaddq_f32(accum.val[0], biases.val[0]),
+ vaddq_f32(accum.val[1], biases.val[1]),
+ vaddq_f32(accum.val[2], biases.val[2]),
+ vaddq_f32(accum.val[3], biases.val[3])
+ }
+ };
- vst1q_f32(reinterpret_cast<float *>(in0_out.ptr()), vaddq_f32(accum, biases));
- },
- in0_out, in1);
+ vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+ case DataType::QS8:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t accum = vld1q_qs8(reinterpret_cast<const qint8_t *>(in0_out.ptr()));
+ const qint8x16_t biases = vld1q_qs8(reinterpret_cast<const qint8_t *>(in1.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(in0_out.ptr()), vqaddq_qs8(accum, biases));
+ },
+ in0_out, in1);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index c6460b1..71dd4c7 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -37,19 +38,151 @@
class Coordinates;
} // namespace arm_compute
+namespace
+{
+void matrix_addition_f32(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+ const float32x4_t beta_f32 = vdupq_n_f32(beta);
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(out.ptr());
+
+ float32x4x4_t alpha_ab =
+ {
+ {
+ vld1q_f32(out_ptr + 0),
+ vld1q_f32(out_ptr + 4),
+ vld1q_f32(out_ptr + 8),
+ vld1q_f32(out_ptr + 12)
+ }
+ };
+
+ const float32x4x4_t c =
+ {
+ {
+ vld1q_f32(in_ptr + 0),
+ vld1q_f32(in_ptr + 4),
+ vld1q_f32(in_ptr + 8),
+ vld1q_f32(in_ptr + 12)
+ }
+ };
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
+ alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
+ alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
+ alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
+
+ vst1q_f32(out_ptr + 0, alpha_ab.val[0]);
+ vst1q_f32(out_ptr + 4, alpha_ab.val[1]);
+ vst1q_f32(out_ptr + 8, alpha_ab.val[2]);
+ vst1q_f32(out_ptr + 12, alpha_ab.val[3]);
+ },
+ in, out);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+ const float16x8_t beta_f16 = vdupq_n_f16(beta);
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
+
+ float16x8x2_t alpha_ab =
+ {
+ {
+ vld1q_f16(out_ptr + 0),
+ vld1q_f16(out_ptr + 8)
+ }
+ };
+
+ float16x8x2_t c =
+ {
+ {
+ vld1q_f16(in_ptr + 0),
+ vld1q_f16(in_ptr + 8)
+ }
+ };
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
+ alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
+
+ vst1q_f16(out_ptr + 0, alpha_ab.val[0]);
+ vst1q_f16(out_ptr + 8, alpha_ab.val[1]);
+ },
+ in, out);
+}
+#endif
+
+void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+ const int fixed_point_position = input->info()->fixed_point_position();
+ const qint8x16_t beta_qs8 = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position));
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<qint8_t *>(out.ptr());
+
+ qint8x16_t alpha_ab = vld1q_qs8(out_ptr);
+ const qint8x16_t c = vld1q_qs8(in_ptr);
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab = vqmlaq_qs8(alpha_ab, c, beta_qs8, fixed_point_position);
+
+ vst1q_qs8(out_ptr, alpha_ab);
+ },
+ in, out);
+}
+} // namespace
+
NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
- : INESimpleKernel(), _beta(0.0f)
+ : INESimpleKernel(), _func(nullptr), _beta(0.0f)
{
}
-void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, const float beta)
+void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+ switch(input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = &matrix_addition_f32;
+ break;
+ case DataType::QS8:
+ _func = &matrix_addition_qs8;
+ break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &matrix_addition_f16;
+ break;
+#endif
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+
constexpr unsigned int num_elems_processed_per_iteration = 16;
INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
@@ -62,103 +195,8 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
- if(0.0f != _beta)
+ if(_beta != 0.0f)
{
- switch(_input->info()->data_type())
- {
- case DataType::F32:
- {
- const float32x4_t beta_f32 = vdupq_n_f32(_beta);
-
- Iterator in(_input, window);
- Iterator out(_output, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
- const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
- float32x4x4_t alpha_ab =
- {
- {
- vld1q_f32(out_ptr + 0),
- vld1q_f32(out_ptr + 4),
- vld1q_f32(out_ptr + 8),
- vld1q_f32(out_ptr + 12)
- }
- };
-
- const float32x4x4_t c =
- {
- {
- vld1q_f32(in_ptr + 0),
- vld1q_f32(in_ptr + 4),
- vld1q_f32(in_ptr + 8),
- vld1q_f32(in_ptr + 12)
- }
- };
-
- /* Multiply matrix C by its weight and accumulate */
- alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
- alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
- alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
- alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
-
- vst1q_f32(out_ptr + 0, alpha_ab.val[0]);
- vst1q_f32(out_ptr + 4, alpha_ab.val[1]);
- vst1q_f32(out_ptr + 8, alpha_ab.val[2]);
- vst1q_f32(out_ptr + 12, alpha_ab.val[3]);
- },
- in, out);
-
- break;
- }
- case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
- {
- const float16x8_t beta_f16 = vdupq_n_f16(_beta);
-
- Iterator in(_input, window);
- Iterator out(_output, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
-
- float16x8x2_t alpha_ab =
- {
- {
- vld1q_f16(out_ptr + 0),
- vld1q_f16(out_ptr + 8)
- }
- };
-
- float16x8x2_t c =
- {
- {
- vld1q_f16(in_ptr + 0),
- vld1q_f16(in_ptr + 8)
- }
- };
-
- /* Multiply matrix C by its weight and accumulate */
- alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
- alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
-
- vst1q_f16(out_ptr + 0, alpha_ab.val[0]);
- vst1q_f16(out_ptr + 8, alpha_ab.val[1]);
- },
- in, out);
-
- break;
- }
-#endif
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
+ (*_func)(_input, _output, window, _beta);
}
}
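
The addition kernel now resolves the data-type-specific routine once in configure() and run() only dereferences the stored function pointer. A stripped-down sketch of that dispatch pattern, with illustrative names rather than the library's API:

    #include <cstdio>

    enum class Type { F32, QS8 };
    using AddFn = void (*)(float beta);

    static void add_f32(float beta) { std::printf("F32 path, beta=%f\n", beta); }
    static void add_qs8(float beta) { std::printf("QS8 path, beta=%f\n", beta); }

    struct AdditionKernel
    {
        AddFn func = nullptr;
        float beta = 0.0f;

        void configure(Type t, float b)
        {
            beta = b;
            func = (t == Type::F32) ? &add_f32 : &add_qs8; // resolved once per configuration
        }

        void run() const
        {
            if(beta != 0.0f) // the pass is skipped entirely when beta == 0
            {
                (*func)(beta);
            }
        }
    };
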
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index 53f4eed..dcfbb13 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
@@ -55,10 +56,11 @@
const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+ // The implementation computes 16 elements per iteration
const int window_start_x = 16 * window.thread_id();
const int window_step_x = 16 * window.num_threads();
// Make sure (window_end_x - window_start_x) is a multiple of window_step_x
- const int window_end_x = width_matrix_b + (window_step_x - ((width_matrix_b - window_start_x) % window_step_x)) % window_step_x;
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
Window win_out(window);
win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
@@ -82,8 +84,6 @@
Iterator inb(input1, win_b);
Iterator out(output, win_out);
- const auto vec_a = reinterpret_cast<const float *>(ina.ptr());
-
execute_window_loop(win_out, [&](const Coordinates & id)
{
if(id.x() > width_matrix_b)
@@ -96,33 +96,37 @@
float32x4_t acc2 = vdupq_n_f32(0.f);
float32x4_t acc3 = vdupq_n_f32(0.f);
- const auto matrix_b = reinterpret_cast<const float *>(inb.ptr());
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(inb.ptr());
- int i = 0;
- for(; i <= (num_elems_vec_a - 4); i += 4)
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif
+
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for(; vec_a <= (vec_a_end_addr - 4);)
{
- const float32x2_t a0l = vld1_f32(&vec_a[i]);
- const float32x2_t a0h = vld1_f32(&vec_a[i] + 2);
+ float32x2_t a0l = vld1_f32(vec_a);
- const float32x4_t b00 = vld1q_f32(&matrix_b[0 + (i + 0) * in_b_stride]);
- const float32x4_t b01 = vld1q_f32(&matrix_b[4 + (i + 0) * in_b_stride]);
- const float32x4_t b02 = vld1q_f32(&matrix_b[8 + (i + 0) * in_b_stride]);
- const float32x4_t b03 = vld1q_f32(&matrix_b[12 + (i + 0) * in_b_stride]);
+ float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
- const float32x4_t b10 = vld1q_f32(&matrix_b[0 + (i + 1) * in_b_stride]);
- const float32x4_t b11 = vld1q_f32(&matrix_b[4 + (i + 1) * in_b_stride]);
- const float32x4_t b12 = vld1q_f32(&matrix_b[8 + (i + 1) * in_b_stride]);
- const float32x4_t b13 = vld1q_f32(&matrix_b[12 + (i + 1) * in_b_stride]);
+ float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
- const float32x4_t b20 = vld1q_f32(&matrix_b[0 + (i + 2) * in_b_stride]);
- const float32x4_t b21 = vld1q_f32(&matrix_b[4 + (i + 2) * in_b_stride]);
- const float32x4_t b22 = vld1q_f32(&matrix_b[8 + (i + 2) * in_b_stride]);
- const float32x4_t b23 = vld1q_f32(&matrix_b[12 + (i + 2) * in_b_stride]);
-
- const float32x4_t b30 = vld1q_f32(&matrix_b[0 + (i + 3) * in_b_stride]);
- const float32x4_t b31 = vld1q_f32(&matrix_b[4 + (i + 3) * in_b_stride]);
- const float32x4_t b32 = vld1q_f32(&matrix_b[8 + (i + 3) * in_b_stride]);
- const float32x4_t b33 = vld1q_f32(&matrix_b[12 + (i + 3) * in_b_stride]);
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif
acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
@@ -134,30 +138,51 @@
acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
- acc0 = vmlaq_lane_f32(acc0, b20, a0h, 0);
- acc1 = vmlaq_lane_f32(acc1, b21, a0h, 0);
- acc2 = vmlaq_lane_f32(acc2, b22, a0h, 0);
- acc3 = vmlaq_lane_f32(acc3, b23, a0h, 0);
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
- acc0 = vmlaq_lane_f32(acc0, b30, a0h, 1);
- acc1 = vmlaq_lane_f32(acc1, b31, a0h, 1);
- acc2 = vmlaq_lane_f32(acc2, b32, a0h, 1);
- acc3 = vmlaq_lane_f32(acc3, b33, a0h, 1);
+ a0l = vld1_f32(vec_a);
+
+ b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
}
- for(; i < num_elems_vec_a; i++)
+ for(; vec_a < vec_a_end_addr;)
{
- const float a0 = vec_a[i];
+ const float a0 = *vec_a;
- const float32x4_t b00 = vld1q_f32(&matrix_b[0 + i * in_b_stride]);
- const float32x4_t b01 = vld1q_f32(&matrix_b[4 + i * in_b_stride]);
- const float32x4_t b02 = vld1q_f32(&matrix_b[8 + i * in_b_stride]);
- const float32x4_t b03 = vld1q_f32(&matrix_b[12 + i * in_b_stride]);
+ const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
acc0 = vmlaq_n_f32(acc0, b00, a0);
acc1 = vmlaq_n_f32(acc1, b01, a0);
acc2 = vmlaq_n_f32(acc2, b02, a0);
acc3 = vmlaq_n_f32(acc3, b03, a0);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
}
// Multiply by the weight of matrix product (alpha)
@@ -172,12 +197,140 @@
const auto vec_out = reinterpret_cast<float *>(out.ptr());
- vst1q_f32(&vec_out[0], acc0);
- vst1q_f32(&vec_out[4], acc1);
- vst1q_f32(&vec_out[8], acc2);
- vst1q_f32(&vec_out[12], acc3);
+ vst1q_f32(vec_out + 0, acc0);
+ vst1q_f32(vec_out + 4, acc1);
+ vst1q_f32(vec_out + 8, acc2);
+ vst1q_f32(vec_out + 12, acc3);
},
- inb, out);
+ ina, inb, out);
+}
+
+template <bool multiply_alpha>
+void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+ const int fixed_point_position = input0->info()->fixed_point_position();
+
+ // The implementation computes 32 elements per iteration
+ const int window_start_x = 32 * window.thread_id();
+ const int window_step_x = 32 * window.num_threads();
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ // Reset accumulators
+ qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
+
+ auto vec_a = reinterpret_cast<const qint8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const qint8_t *>(inb.ptr());
+
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for(; vec_a <= (vec_a_end_addr - 2);)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(vec_a + 0);
+ const qint8x8_t a1 = vld1_dup_qs8(vec_a + 1);
+
+ const qint8x8_t b00 = vld1_qs8(matrix_b + 0 + 0 * in_b_stride);
+ const qint8x8_t b01 = vld1_qs8(matrix_b + 8 + 0 * in_b_stride);
+ const qint8x8_t b02 = vld1_qs8(matrix_b + 16 + 0 * in_b_stride);
+ const qint8x8_t b03 = vld1_qs8(matrix_b + 24 + 0 * in_b_stride);
+ const qint8x8_t b10 = vld1_qs8(matrix_b + 0 + 1 * in_b_stride);
+ const qint8x8_t b11 = vld1_qs8(matrix_b + 8 + 1 * in_b_stride);
+ const qint8x8_t b12 = vld1_qs8(matrix_b + 16 + 1 * in_b_stride);
+ const qint8x8_t b13 = vld1_qs8(matrix_b + 24 + 1 * in_b_stride);
+
+ // First accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
+
+ // Second accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b10, a1, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b11, a1, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a1, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a1, fixed_point_position);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(vec_a);
+
+ const qint8x8_t b00 = vld1_qs8(matrix_b + 0);
+ const qint8x8_t b01 = vld1_qs8(matrix_b + 8);
+ const qint8x8_t b02 = vld1_qs8(matrix_b + 16);
+ const qint8x8_t b03 = vld1_qs8(matrix_b + 24);
+
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ // Convert back to qint8x8_t and saturate
+ qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
+ qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
+ qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
+ qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
+
+ // Multiply by the weight of the matrix product (alpha)
+ if(multiply_alpha)
+ {
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
+ acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
+ acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
+ acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
+
+ // Store 8x4 output elements
+ vst1_qs8(mtx_out0 + 0, acc00_qs8);
+ vst1_qs8(mtx_out0 + 8, acc01_qs8);
+ vst1_qs8(mtx_out0 + 16, acc02_qs8);
+ vst1_qs8(mtx_out0 + 24, acc03_qs8);
+ },
+ ina, inb, out);
}
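
In both vector-matrix variants each thread starts at its own offset (16 or 32 elements times its thread id) and strides by the combined width of all threads; window_end_x is then rounded up so the covered span is an exact number of steps, and the id.x() > width_matrix_b guard skips the padded tail. A small worked sketch of that arithmetic, assuming ceil_to_multiple rounds its first argument up to the next multiple of the second:

    #include <cassert>

    // Round 'value' up to the next multiple of 'step' (illustration of the helper's assumed behaviour).
    constexpr int ceil_to_multiple(int value, int step)
    {
        return ((value + step - 1) / step) * step;
    }

    int main()
    {
        const int width_matrix_b = 100;
        const int thread_id      = 1;
        const int num_threads    = 2;

        const int window_start_x = 32 * thread_id;   // 32
        const int window_step_x  = 32 * num_threads; // 64
        // ceil_to_multiple(100 - 32, 64) = 128, so window_end_x = 160
        const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

        // (window_end_x - window_start_x) is a multiple of window_step_x
        assert((window_end_x - window_start_x) % window_step_x == 0);
        return 0;
    }
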
template <bool multiply_alpha>
@@ -202,9 +355,9 @@
win_b = window;
}
    // Set step_x and step_y for matrix B. Scale the X range by a factor of 4 as the input transposed matrix B has 4 times fewer columns than the output matrix
- // The step along the x direction is 4 times the in_b_stride because for each iteration we compute 4 blocks of size 4x4
- win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 4 * in_b_stride));
- win_b.set(Window::DimY, Window::Dimension(0, 1, 0));
+ // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 4x4
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
Iterator ina(input0, win_a);
Iterator inb(input1, win_b);
@@ -218,8 +371,6 @@
auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
auto mtx_b1 = mtx_b0 + in_b_stride;
- auto mtx_b2 = mtx_b1 + in_b_stride;
- auto mtx_b3 = mtx_b2 + in_b_stride;
float32x4_t acc00 = vdupq_n_f32(0.f);
float32x4_t acc10 = vdupq_n_f32(0.f);
@@ -231,55 +382,227 @@
float32x4_t acc21 = vdupq_n_f32(0.f);
float32x4_t acc31 = vdupq_n_f32(0.f);
- float32x4_t acc02 = vdupq_n_f32(0.f);
- float32x4_t acc12 = vdupq_n_f32(0.f);
- float32x4_t acc22 = vdupq_n_f32(0.f);
- float32x4_t acc32 = vdupq_n_f32(0.f);
+#if __arm__
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
- float32x4_t acc03 = vdupq_n_f32(0.f);
- float32x4_t acc13 = vdupq_n_f32(0.f);
- float32x4_t acc23 = vdupq_n_f32(0.f);
- float32x4_t acc33 = vdupq_n_f32(0.f);
-
- for(int k = 0; k < num_elems_matrix_b_x; k += 4)
+ auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
+ for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
{
- const float32x4_t a = vld1q_f32(mtx_a0);
- const float32x2_t a00l = vget_low_f32(a);
- const float32x2_t a00h = vget_high_f32(a);
- const float32x4_t b00 = vld1q_f32(mtx_b0);
- const float32x4_t b10 = vld1q_f32(mtx_b1);
- const float32x4_t b20 = vld1q_f32(mtx_b2);
- const float32x4_t b30 = vld1q_f32(mtx_b3);
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+ float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
+ float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
// 4x4 block 0
- acc00 = vmlaq_lane_f32(acc00, b00, a00l, 0);
- acc10 = vmlaq_lane_f32(acc10, b00, a00l, 1);
- acc20 = vmlaq_lane_f32(acc20, b00, a00h, 0);
- acc30 = vmlaq_lane_f32(acc30, b00, a00h, 1);
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
+ float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
+ float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
+ float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
// 4x4 block 1
- acc01 = vmlaq_lane_f32(acc01, b10, a00l, 0);
- acc11 = vmlaq_lane_f32(acc11, b10, a00l, 1);
- acc21 = vmlaq_lane_f32(acc21, b10, a00h, 0);
- acc31 = vmlaq_lane_f32(acc31, b10, a00h, 1);
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
- // 4x4 block 2
- acc02 = vmlaq_lane_f32(acc02, b20, a00l, 0);
- acc12 = vmlaq_lane_f32(acc12, b20, a00l, 1);
- acc22 = vmlaq_lane_f32(acc22, b20, a00h, 0);
- acc32 = vmlaq_lane_f32(acc32, b20, a00h, 1);
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
- // 4x4 block 3
- acc03 = vmlaq_lane_f32(acc03, b30, a00l, 0);
- acc13 = vmlaq_lane_f32(acc13, b30, a00l, 1);
- acc23 = vmlaq_lane_f32(acc23, b30, a00h, 0);
- acc33 = vmlaq_lane_f32(acc33, b30, a00h, 1);
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+ }
+
+ for(; mtx_b0 < mtx_b0_end_addr;)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
mtx_a0 += 4;
mtx_b0 += 4;
mtx_b1 += 4;
- mtx_b2 += 4;
- mtx_b3 += 4;
}
// Multiply by the weight of matrix product (alpha)
@@ -294,38 +617,20 @@
acc11 = vmulq_f32(acc11, alpha_f32);
acc21 = vmulq_f32(acc21, alpha_f32);
acc31 = vmulq_f32(acc31, alpha_f32);
- acc02 = vmulq_f32(acc02, alpha_f32);
- acc12 = vmulq_f32(acc12, alpha_f32);
- acc22 = vmulq_f32(acc22, alpha_f32);
- acc32 = vmulq_f32(acc32, alpha_f32);
- acc03 = vmulq_f32(acc03, alpha_f32);
- acc13 = vmulq_f32(acc13, alpha_f32);
- acc23 = vmulq_f32(acc23, alpha_f32);
- acc33 = vmulq_f32(acc33, alpha_f32);
}
const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
const auto mtx_out1 = mtx_out0 + 4;
- const auto mtx_out2 = mtx_out1 + 4;
- const auto mtx_out3 = mtx_out2 + 4;
// Store the 4 blocks
vst1q_f32(mtx_out0, acc00);
vst1q_f32(mtx_out1, acc01);
- vst1q_f32(mtx_out2, acc02);
- vst1q_f32(mtx_out3, acc03);
vst1q_f32(mtx_out0 + out_stride1, acc10);
vst1q_f32(mtx_out1 + out_stride1, acc11);
- vst1q_f32(mtx_out2 + out_stride1, acc12);
- vst1q_f32(mtx_out3 + out_stride1, acc13);
vst1q_f32(mtx_out0 + out_stride2, acc20);
vst1q_f32(mtx_out1 + out_stride2, acc21);
- vst1q_f32(mtx_out2 + out_stride2, acc22);
- vst1q_f32(mtx_out3 + out_stride2, acc23);
vst1q_f32(mtx_out0 + out_stride3, acc30);
vst1q_f32(mtx_out1 + out_stride3, acc31);
- vst1q_f32(mtx_out2 + out_stride3, acc32);
- vst1q_f32(mtx_out3 + out_stride3, acc33);
},
ina, inb, out);
}
@@ -453,6 +758,240 @@
ARM_COMPUTE_ERROR("Not implemented");
#endif
}
+
+template <bool multiply_alpha>
+void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const size_t out_stride2 = out_stride1 * 2;
+ const size_t out_stride3 = out_stride1 * 3;
+ const int num_elems_matrix_b_x = input1->info()->dimension(0);
+ const int fixed_point_position = input0->info()->fixed_point_position();
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ ARM_COMPUTE_UNUSED(alpha_qs8);
+
+    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the input interleaved matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+    // Set step_x and step_y for matrix B. Scale the X range by a factor of 16 as the input transposed matrix B has 16 times fewer columns than the output matrix
+ // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 16x4
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, 2 * in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, window);
+
+    // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
+    // The reshaping of the matrices makes the implementation cache-friendly and avoids the data re-arrangements needed for computing 32x4 elements per iteration
+    // All the values needed for computing a single 32x4 block will be read from consecutive memory positions
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto mtx_a0 = reinterpret_cast<const qint8_t *>(ina.ptr());
+ auto mtx_b0 = reinterpret_cast<const qint8_t *>(inb.ptr());
+ auto mtx_b1 = mtx_b0 + in_b_stride;
+
+ qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc10_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc20_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc30_qs16 = vdupq_n_qs16(0);
+
+ qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc11_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc21_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc31_qs16 = vdupq_n_qs16(0);
+
+ qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc12_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc22_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc32_qs16 = vdupq_n_qs16(0);
+
+ qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc13_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc23_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc33_qs16 = vdupq_n_qs16(0);
+
+ int k = 0;
+ // This for loop performs 2 accumulations
+ for(; k <= (num_elems_matrix_b_x - 32); k += 32)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
+ const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
+ const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
+ const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
+ const qint8x8_t a4 = vld1_dup_qs8(mtx_a0 + 4);
+ const qint8x8_t a5 = vld1_dup_qs8(mtx_a0 + 5);
+ const qint8x8_t a6 = vld1_dup_qs8(mtx_a0 + 6);
+ const qint8x8_t a7 = vld1_dup_qs8(mtx_a0 + 7);
+
+ const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
+ const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
+ const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
+ const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
+
+ // First accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
+ acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
+ acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
+ acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
+ acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
+ acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
+
+ const qint8x8_t b02 = vld1_qs8(mtx_b0 + 16);
+ const qint8x8_t b03 = vld1_qs8(mtx_b0 + 24);
+ const qint8x8_t b12 = vld1_qs8(mtx_b1 + 16);
+ const qint8x8_t b13 = vld1_qs8(mtx_b1 + 24);
+
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
+ acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
+ acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
+ acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
+ acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
+ acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+ // Second accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position);
+ acc10_qs16 = vqmlal_qs8(acc10_qs16, b02, a5, fixed_point_position);
+ acc20_qs16 = vqmlal_qs8(acc20_qs16, b02, a6, fixed_point_position);
+ acc30_qs16 = vqmlal_qs8(acc30_qs16, b02, a7, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b03, a4, fixed_point_position);
+ acc11_qs16 = vqmlal_qs8(acc11_qs16, b03, a5, fixed_point_position);
+ acc21_qs16 = vqmlal_qs8(acc21_qs16, b03, a6, fixed_point_position);
+ acc31_qs16 = vqmlal_qs8(acc31_qs16, b03, a7, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a4, fixed_point_position);
+ acc12_qs16 = vqmlal_qs8(acc12_qs16, b12, a5, fixed_point_position);
+ acc22_qs16 = vqmlal_qs8(acc22_qs16, b12, a6, fixed_point_position);
+ acc32_qs16 = vqmlal_qs8(acc32_qs16, b12, a7, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a4, fixed_point_position);
+ acc13_qs16 = vqmlal_qs8(acc13_qs16, b13, a5, fixed_point_position);
+ acc23_qs16 = vqmlal_qs8(acc23_qs16, b13, a6, fixed_point_position);
+ acc33_qs16 = vqmlal_qs8(acc33_qs16, b13, a7, fixed_point_position);
+
+ mtx_a0 += 8;
+ mtx_b0 += 32;
+ mtx_b1 += 32;
+ }
+
+ // This for loop performs the left over accumulations
+ for(; k < num_elems_matrix_b_x; k += 16)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
+ const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
+ const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
+ const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
+
+ const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
+ const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
+ const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
+ const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
+
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
+ acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
+ acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
+ acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
+ acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
+ acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
+ acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
+ acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
+ acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
+ acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
+ acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
+
+ mtx_a0 += 4;
+ mtx_b0 += 16;
+ mtx_b1 += 16;
+ }
+
+ // Convert back to qint8x8_t and saturate
+ qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
+ qint8x8_t acc10_qs8 = vqmovn_qs16(acc10_qs16);
+ qint8x8_t acc20_qs8 = vqmovn_qs16(acc20_qs16);
+ qint8x8_t acc30_qs8 = vqmovn_qs16(acc30_qs16);
+
+ qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
+ qint8x8_t acc11_qs8 = vqmovn_qs16(acc11_qs16);
+ qint8x8_t acc21_qs8 = vqmovn_qs16(acc21_qs16);
+ qint8x8_t acc31_qs8 = vqmovn_qs16(acc31_qs16);
+
+ qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
+ qint8x8_t acc12_qs8 = vqmovn_qs16(acc12_qs16);
+ qint8x8_t acc22_qs8 = vqmovn_qs16(acc22_qs16);
+ qint8x8_t acc32_qs8 = vqmovn_qs16(acc32_qs16);
+
+ qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
+ qint8x8_t acc13_qs8 = vqmovn_qs16(acc13_qs16);
+ qint8x8_t acc23_qs8 = vqmovn_qs16(acc23_qs16);
+ qint8x8_t acc33_qs8 = vqmovn_qs16(acc33_qs16);
+
+ // Multiply by the weight of the matrix product (alpha)
+ if(multiply_alpha)
+ {
+ acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
+ acc10_qs8 = vqmul_qs8(acc10_qs8, alpha_qs8, fixed_point_position);
+ acc20_qs8 = vqmul_qs8(acc20_qs8, alpha_qs8, fixed_point_position);
+ acc30_qs8 = vqmul_qs8(acc30_qs8, alpha_qs8, fixed_point_position);
+ acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
+ acc11_qs8 = vqmul_qs8(acc11_qs8, alpha_qs8, fixed_point_position);
+ acc21_qs8 = vqmul_qs8(acc21_qs8, alpha_qs8, fixed_point_position);
+ acc31_qs8 = vqmul_qs8(acc31_qs8, alpha_qs8, fixed_point_position);
+ acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
+ acc12_qs8 = vqmul_qs8(acc12_qs8, alpha_qs8, fixed_point_position);
+ acc22_qs8 = vqmul_qs8(acc22_qs8, alpha_qs8, fixed_point_position);
+ acc32_qs8 = vqmul_qs8(acc32_qs8, alpha_qs8, fixed_point_position);
+ acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
+ acc13_qs8 = vqmul_qs8(acc13_qs8, alpha_qs8, fixed_point_position);
+ acc23_qs8 = vqmul_qs8(acc23_qs8, alpha_qs8, fixed_point_position);
+ acc33_qs8 = vqmul_qs8(acc33_qs8, alpha_qs8, fixed_point_position);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
+
+ // Store 32x4 output elements
+ vst1_qs8(mtx_out0 + 0, acc00_qs8);
+ vst1_qs8(mtx_out0 + 8, acc01_qs8);
+ vst1_qs8(mtx_out0 + 16, acc02_qs8);
+ vst1_qs8(mtx_out0 + 24, acc03_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 0, acc10_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 8, acc11_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 16, acc12_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 24, acc13_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 0, acc20_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 8, acc21_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 16, acc22_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 24, acc23_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 0, acc30_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 8, acc31_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 16, acc32_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 24, acc33_qs8);
+ },
+ ina, inb, out);
+}
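+
As the comments above note, both the F32 and QS8 block kernels rely on matrix A having been rearranged by NEGEMMInterleave4x4 so that the four values feeding one 4-row block sit next to each other in memory. A rough sketch of that interleaving on a plain row-major buffer, assuming the number of rows is a multiple of 4 and ignoring any padding the real kernel handles:

    #include <cstddef>
    #include <vector>

    // Interleave groups of 4 consecutive rows column by column:
    // [a00 a01 ...; a10 a11 ...; a20 ...; a30 ...] -> [a00 a10 a20 a30 a01 a11 a21 a31 ...]
    std::vector<float> interleave_4x4(const std::vector<float> &a, size_t rows, size_t cols)
    {
        std::vector<float> out(rows * cols);
        size_t idx = 0;
        for(size_t r = 0; r < rows; r += 4)
        {
            for(size_t c = 0; c < cols; ++c)
            {
                for(size_t i = 0; i < 4; ++i)
                {
                    out[idx++] = a[(r + i) * cols + c];
                }
            }
        }
        return out;
    }
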
+
} // namespace
NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel()
@@ -462,10 +1001,13 @@
void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
if(output->info()->dimension(1) == 1)
{
ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
@@ -479,10 +1021,27 @@
unsigned int num_elems_processed_per_iteration_x = 0;
const unsigned int num_elems_processed_per_iteration_y = 4;
- // Check if the output tensor is a vector and the data type is F32. If so,the kernel runs the vector-matrix multiplication
- if((output->info()->dimension(1) == 1) && (input0->info()->data_type() == DataType::F32))
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+ if((output->info()->dimension(1) == 1))
{
- num_elems_processed_per_iteration_x = 16;
+ switch(input0->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ num_elems_processed_per_iteration_x = 16;
+ break;
+ }
+ case DataType::QS8:
+ {
+ num_elems_processed_per_iteration_x = 32;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
@@ -494,7 +1053,9 @@
AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
INEKernel::configure(win);
}
@@ -502,16 +1063,23 @@
{
switch(input0->info()->data_type())
{
- case DataType::F16:
+ case DataType::F32:
{
num_elems_processed_per_iteration_x = 8;
break;
}
- case DataType::F32:
+ case DataType::QS8:
{
- num_elems_processed_per_iteration_x = 16;
+ num_elems_processed_per_iteration_x = 32;
break;
}
+ case DataType::F16:
+ {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ num_elems_processed_per_iteration_x = 8;
+ break;
+#endif
+ }
default:
{
ARM_COMPUTE_ERROR("Data type not supported");
@@ -543,45 +1111,53 @@
bool multiply_alpha = std::abs(1.0f - _alpha) > 0.00001f;
    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
- if((_output->info()->dimension(1) == 1) && (_input0->info()->data_type() == DataType::F32))
+ if((_output->info()->dimension(1) == 1))
{
- if(multiply_alpha)
+ switch(_input0->info()->data_type())
{
- vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha);
- }
- else
- {
- vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
+ case DataType::F32:
+ {
+ multiply_alpha ? vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
+ vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ case DataType::QS8:
+ {
+ multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
+ vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
}
}
else
{
switch(_input0->info()->data_type())
{
- case DataType::F16:
- {
- if(multiply_alpha)
- {
- matrix_matrix_multiply_f16<true>(_input0, _input1, _output, window, _alpha);
- }
- else
- {
- matrix_matrix_multiply_f16<false>(_input0, _input1, _output, window, _alpha);
- }
- break;
- }
case DataType::F32:
{
- if(multiply_alpha)
- {
- matrix_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha);
- }
- else
- {
- matrix_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
- }
+ multiply_alpha ? matrix_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
break;
}
+ case DataType::QS8:
+ {
+ multiply_alpha ? matrix_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ case DataType::F16:
+ {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ multiply_alpha ? matrix_matrix_multiply_f16<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_f16<false>(_input0, _input1, _output, window, _alpha);
+ break;
+#endif
+ }
default:
{
ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 1dbc07a..ccf5cb4 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
@@ -42,33 +43,22 @@
void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t transpose_w = 16 / input->info()->element_size();
+ output_shape.set(0, input->info()->dimension(1) * transpose_w);
+ output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+
+    // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 8.0f)) && (input->info()->data_type() == DataType::F16));
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f)) && (input->info()->data_type() == DataType::F32));
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f)) && (input->info()->data_type() == DataType::U8));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- unsigned int num_elems_processed_per_iteration = 0;
- float scale_x = 1.f;
-
- switch(input->info()->data_type())
- {
- case DataType::F32:
- case DataType::U8:
- num_elems_processed_per_iteration = 4;
- scale_x = 4.f;
- break;
- case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
- num_elems_processed_per_iteration = 8;
- scale_x = 8.f;
- break;
-#endif
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const float scale_x = num_elems_processed_per_iteration;
_input = input;
_output = output;
@@ -99,11 +89,10 @@
* |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
* |a30 a31 a32 a33|
*
- * If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
- * If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
+ * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
*/
- /* Set window for output tensor. Set to 0 the X and Y dimensions in order to allow multi-threading implementation and future batched matrix multiplications. */
+ // Set window for output tensor. Set to 0 the X and Y dimensions in order to allow multi-threading implementation and future batched matrix multiplications
Window win_out(window);
win_out.set(Window::DimX, Window::Dimension(0, 0, 0));
win_out.set(Window::DimY, Window::Dimension(0, 0, 0));
@@ -111,56 +100,50 @@
Iterator in(_input, window);
Iterator out(_output, win_out);
- switch(_input->info()->data_type())
+ switch(_input->info()->element_size())
{
- case DataType::F32:
+ case 1:
+ {
+ const size_t out_stride = _output->info()->strides_in_bytes()[1];
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Output address = base addr + (y * 16) + (x / 16 ) * stride
+ const uint8_t *in_ptr = in.ptr();
+ uint8_t *const out_ptr = out.ptr() + (id.y() << 4) + (id.x() >> 4) * out_stride;
+ vst1q_u8(out_ptr, vld1q_u8(in_ptr));
+ },
+ in, out);
+ break;
+ }
+ case 2:
+ {
+ const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(int16_t);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Output address = base addr + (y * 8) + (x / 8 ) * stride
+ const auto in_ptr = reinterpret_cast<const uint16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<uint16_t *>(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride;
+ vst1q_u16(out_ptr, vld1q_u16(in_ptr));
+ },
+ in, out);
+ break;
+ }
+ case 4:
{
const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float);
-
execute_window_loop(window, [&](const Coordinates & id)
{
- const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
- const float32x4_t data = vld1q_f32(in_ptr);
- /* Output address = base addr + (y * 4) + (x / 4 ) * stride */
- const auto out_ptr = reinterpret_cast<float *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
- vst1q_f32(out_ptr, data);
+ // Output address = base addr + (y * 4) + (x / 4 ) * stride
+ const auto in_ptr = reinterpret_cast<const uint32_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<uint32_t *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
+ vst1q_u32(out_ptr, vld1q_u32(in_ptr));
},
in, out);
break;
}
- case DataType::U8:
- {
- const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(uint8_t);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
- /* Output address = base addr + (y * 4) + (x / 4 ) * stride */
- const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
- std::copy_n(in_ptr, 4, out_ptr);
- },
- in, out);
- break;
- }
-
- case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
- {
- const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float16_t);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
- // Output address = base addr + (y * 8) + (x / 8 ) * stride
- float16_t *out_ptr = reinterpret_cast<float16_t *>(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride;
- vst1q_f16(out_ptr, vld1q_f16(in_ptr));
- },
- in, out);
- break;
- }
-#endif
default:
{
- ARM_COMPUTE_ERROR("Data type not supported");
+ ARM_COMPUTE_ERROR("Element size not supported");
break;
}
}
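
The output shape that configure() now derives depends only on the element size: with W = 16 / element_size, the transposed tensor is [height * W, ceil(width / W)]. A short sketch of the same computation, for illustration only:

    #include <cmath>
    #include <cstddef>
    #include <utility>

    // Shape of the 1xW-transposed matrix, where W = 16 / element size in bytes.
    // Example: a 61x23 F32 matrix (element size 4, W = 4) becomes [23 * 4, ceil(61 / 4)] = [92, 16].
    std::pair<size_t, size_t> transpose1xw_shape(size_t width, size_t height, size_t element_size)
    {
        const size_t w          = 16 / element_size;
        const size_t out_width  = height * w;
        const size_t out_height = static_cast<size_t>(std::ceil(width / static_cast<float>(w)));
        return { out_width, out_height };
    }
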
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index 7bc4534..404ad8a 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -46,7 +46,6 @@
static const int32x4_t zero_s32 = vdupq_n_s32(0);
static const int32x4_t one_s32 = vdupq_n_s32(1);
const int32x4_t num_bins_s32 = vdupq_n_s32(num_bins);
- const int32x4_t max_hidx_s32 = vdupq_n_s32(num_bins - 1);
memset(output_ptr, 0, sizeof(float) * num_bins);
@@ -70,14 +69,10 @@
// Compute histogram index.
int32x4_t hidx_s32 = vcvtq_s32_f32(phase_f32);
- // Check if the histogram index is equal to num_bins. If so, replace the index with max_hidx
- uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32);
- hidx_s32 = vbslq_s32(mask, max_hidx_s32, hidx_s32);
-
// Compute magnitude weights (w0 and w1)
const float32x4_t hidx_f32 = vcvtq_f32_s32(hidx_s32);
- // w1 = phase_f32 - hidx_s32
+ // w1 = phase_f32 - hidx_f32
const float32x4_t w1_f32 = vsubq_f32(phase_f32, hidx_f32);
// w0 = 1.0 - w1
@@ -89,6 +84,10 @@
// Weighted vote between 2 bins
+ // Check if the histogram index is equal to num_bins. If so, replace the index with 0
+ uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32);
+ hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32);
+
// Bin 0
*(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w0_f32, 0);
*(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w0_f32, 1);
@@ -108,11 +107,12 @@
*(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w1_f32, 3);
}
- for(; xc < static_cast<int32_t>(cell_width); xc++)
+ for(; xc < static_cast<int32_t>(cell_width); ++xc)
{
const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
const float mag_value = *(mag_row_ptr + xc + yc * mag_stride);
- const float w1 = phase_value - std::floor(phase_value);
+
+ const float w1 = phase_value - std::floor(phase_value);
// The quantised phase is the histogram index [0, num_bins - 1] - Round
// Check limit of histogram index. If hidx == num_bins, hidx = 0
@@ -120,7 +120,7 @@
// Weighted vote between 2 bins
*(output_ptr + hidx) += mag_value * (1.0f - w1);
- *(output_ptr + ((hidx + 1) % num_bins)) += mag_value * w1;
+ *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
}
}
}
@@ -329,6 +329,7 @@
vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
}
+ // Compute left over
for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
{
const float input_value = hist_ptr[xc];
@@ -416,7 +417,8 @@
vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
}
- for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
+ // Compute left over
+ for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
{
const float input_value = hist_ptr[xc];
@@ -431,9 +433,9 @@
sum += vgetq_lane_f32(sum_f32, 2);
sum += vgetq_lane_f32(sum_f32, 3);
- float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
- float32x4_t scale_f32 = vdupq_n_f32(scale);
- const float32x4_t thres_l2hys_f32 = vdupq_n_f32(l2_hyst_threshold);
+ float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
+ float32x4_t scale_f32 = vdupq_n_f32(scale);
+ const float32x4_t l2_hyst_threshold_f32 = vdupq_n_f32(l2_hyst_threshold);
// Reset sum
sum_f32 = vdupq_n_f32(0.0f);
@@ -460,10 +462,10 @@
input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
// Clip input_value if over _threshold_l2hys
- input_value.val[0] = vminq_f32(input_value.val[0], thres_l2hys_f32);
- input_value.val[1] = vminq_f32(input_value.val[1], thres_l2hys_f32);
- input_value.val[2] = vminq_f32(input_value.val[2], thres_l2hys_f32);
- input_value.val[3] = vminq_f32(input_value.val[3], thres_l2hys_f32);
+ input_value.val[0] = vminq_f32(input_value.val[0], l2_hyst_threshold_f32);
+ input_value.val[1] = vminq_f32(input_value.val[1], l2_hyst_threshold_f32);
+ input_value.val[2] = vminq_f32(input_value.val[2], l2_hyst_threshold_f32);
+ input_value.val[3] = vminq_f32(input_value.val[3], l2_hyst_threshold_f32);
// Compute input_value^2
sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
@@ -742,7 +744,6 @@
case HOGNormType::L1_NORM:
_func = &l1_norm;
break;
- case HOGNormType::L1SQRT_NORM:
default:
ARM_COMPUTE_ERROR_ON("Normalisation type not supported");
break;
@@ -773,11 +774,11 @@
ARM_COMPUTE_ERROR_ON(_func == nullptr);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- // Get number of element per block
- const size_t num_elem_block = _output->info()->num_channels();
+ // Get number of bins per block
+ const size_t num_bins_per_block = _output->info()->num_channels();
// Number of bins on the same row of the block
- const int32_t num_bins_block_x = _num_cells_per_block.width * _num_bins;
+ const int32_t num_bins_per_block_x = _num_cells_per_block.width * _num_bins;
const size_t input_stride = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
@@ -795,7 +796,7 @@
const auto out_row_ptr = reinterpret_cast<float *>(out.ptr());
// Execute normalization function
- (*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_block_x, num_elem_block, _l2_hyst_threshold);
+ (*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_per_block_x, num_bins_per_block, _l2_hyst_threshold);
},
in, out);
}
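
The cell-histogram code above splits each gradient's magnitude between two adjacent orientation bins, and the bin index now wraps back to 0 when it reaches num_bins instead of being clamped to the last bin. A scalar sketch of that weighted vote, assuming the phase has already been scaled to bin units as in the kernel:

    #include <cmath>

    // Weighted vote of one pixel into an orientation histogram with 'num_bins' bins.
    // 'phase_scaled' is the phase already mapped to [0, num_bins] (the kernel adds 0.5f for rounding).
    void weighted_vote(float *hist, int num_bins, float phase_scaled, float magnitude)
    {
        const float w1   = phase_scaled - std::floor(phase_scaled); // weight of the upper bin
        int         hidx = static_cast<int>(phase_scaled);          // lower bin index
        if(hidx == num_bins)
        {
            hidx = 0; // wrap around, as in the vectorised path above
        }
        hist[hidx] += magnitude * (1.0f - w1);
        hist[(hidx + 1) % num_bins] += magnitude * w1;
    }
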
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
index c71d427..4af22bc 100644
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -35,13 +35,13 @@
NEHOGDetectorKernel::NEHOGDetectorKernel()
: _input(nullptr), _detection_windows(), _hog_descriptor(nullptr), _bias(0.0f), _threshold(0.0f), _idx_class(0), _num_bins_per_descriptor_x(0), _num_blocks_per_descriptor_y(0), _block_stride_width(0),
- _block_stride_height(0), _detection_window_width(0), _detection_window_height(0), _mutex()
+ _block_stride_height(0), _detection_window_width(0), _detection_window_height(0), _max_num_detection_windows(0), _mutex()
{
}
void NEHOGDetectorKernel::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
ARM_COMPUTE_ERROR_ON(hog == nullptr);
ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
@@ -63,6 +63,9 @@
_block_stride_height = block_stride.height;
_detection_window_width = detection_window_size.width;
_detection_window_height = detection_window_size.height;
+ _max_num_detection_windows = detection_windows->max_num_values();
+
+ ARM_COMPUTE_ERROR_ON((_num_bins_per_descriptor_x * _num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
// Get the number of blocks along the x and y directions of the input tensor
const ValidRegion &valid_region = input->info()->valid_region();
@@ -81,8 +84,8 @@
win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
- const unsigned int num_elems_read_per_iteration = _num_bins_per_descriptor_x;
- const unsigned int num_rows_read_per_iteration = _num_blocks_per_descriptor_y;
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = _num_blocks_per_descriptor_y;
update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
@@ -163,17 +166,20 @@
if(score > _threshold)
{
- DetectionWindow win;
- win.x = (id.x() * _block_stride_width);
- win.y = (id.y() * _block_stride_height);
- win.width = _detection_window_width;
- win.height = _detection_window_height;
- win.idx_class = _idx_class;
- win.score = score;
+ if(_detection_windows->num_values() < _max_num_detection_windows)
+ {
+ DetectionWindow win;
+ win.x = (id.x() * _block_stride_width);
+ win.y = (id.y() * _block_stride_height);
+ win.width = _detection_window_width;
+ win.height = _detection_window_height;
+ win.idx_class = _idx_class;
+ win.score = score;
- std::unique_lock<std::mutex> lock(_mutex);
- _detection_windows->push_back(win);
- lock.unlock();
+ std::unique_lock<std::mutex> lock(_mutex);
+ _detection_windows->push_back(win);
+ lock.unlock();
+ }
}
},
in);
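
The detector now stops pushing candidate windows once the array holds max_num_values entries, and the push_back stays serialised behind the kernel's mutex because run() executes concurrently on several threads. A compact sketch of that bounded, lock-guarded append (types simplified, and the size check done under the lock here for brevity):

    #include <cstddef>
    #include <mutex>
    #include <vector>

    struct DetectionWindow { int x, y, width, height; float score; };

    class BoundedWindowArray
    {
    public:
        explicit BoundedWindowArray(size_t max_num_values) : _max(max_num_values) {}

        // Append only if there is room; safe to call from several threads.
        void try_push_back(const DetectionWindow &win)
        {
            std::lock_guard<std::mutex> lock(_mutex);
            if(_windows.size() < _max)
            {
                _windows.push_back(win);
            }
        }

    private:
        size_t                       _max;
        std::mutex                   _mutex;
        std::vector<DetectionWindow> _windows;
    };
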
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 56797f0..585676b 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -51,39 +51,84 @@
{
static const float16x8_t zero = vdupq_n_f16(0.f);
- /* Trace^2 */
+ // Trace^2
float16x8_t trace2 = vaddq_f16(gx2, gy2);
trace2 = vmulq_f16(trace2, trace2);
- /* Det(A) */
+ // Det(A)
float16x8_t det = vmulq_f16(gx2, gy2);
det = vfmsq_f16(det, gxgy, gxgy);
- /* Det(A) - sensitivity * trace^2 */
+ // Det(A) - sensitivity * trace^2
const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
- /* mc > strength_thresh */
+ // mc > strength_thresh
const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
return vbslq_f16(mask, mc, zero);
}
template <size_t block_size>
-inline void harris_score_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
- float norm_factor)
+inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
+ float norm_factor)
{
const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
- /* Normalize */
+ // Normalize
low_gx = vmulq_f16(low_gx, norm_factor_fp16);
low_gy = vmulq_f16(low_gy, norm_factor_fp16);
high_gx = vmulq_f16(high_gx, norm_factor_fp16);
high_gy = vmulq_f16(high_gy, norm_factor_fp16);
- for(size_t i = 0; i < block_size; ++i)
+ float16x8_t gx = vextq_f16(low_gx, high_gx, 0);
+ float16x8_t gy = vextq_f16(low_gy, high_gy, 0);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 1);
+ gy = vextq_f16(low_gy, high_gy, 1);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 2);
+ gy = vextq_f16(low_gy, high_gy, 2);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ if(block_size > 3)
{
- const float16x8_t gx = vextq_f16(low_gx, high_gx, i);
- const float16x8_t gy = vextq_f16(low_gy, high_gy, i);
+ gx = vextq_f16(low_gx, high_gx, 3);
+ gy = vextq_f16(low_gy, high_gy, 3);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 4);
+ gy = vextq_f16(low_gy, high_gy, 4);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+ }
+
+ if(block_size == 7)
+ {
+ gx = vextq_f16(low_gx, high_gx, 5);
+ gy = vextq_f16(low_gy, high_gy, 5);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 6);
+ gy = vextq_f16(low_gy, high_gy, 6);
gx2 = vfmaq_f16(gx2, gx, gx);
gy2 = vfmaq_f16(gy2, gy, gy);
@@ -101,7 +146,7 @@
const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
const auto output = static_cast<float *__restrict>(out_ptr);
- /* Gx^2, Gy^2 and Gx*Gy */
+ // Gx^2, Gy^2 and Gx*Gy
float16x8_t gx2 = vdupq_n_f16(0.0f);
float16x8_t gy2 = vdupq_n_f16(0.0f);
float16x8_t gxgy = vdupq_n_f16(0.0f);
@@ -112,19 +157,19 @@
const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
const float16x8_t low_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
- harris_score_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+ harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
- /* Update gx and gy pointer */
+ // Update gx and gy pointer
gx_ptr_0 += in_stride;
gy_ptr_0 += in_stride;
gx_ptr_1 += in_stride;
gy_ptr_1 += in_stride;
}
- /* Calculate harris score */
+ // Calculate harris score
const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
- /* Store score */
+ // Store score
vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
}
@@ -143,7 +188,7 @@
const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
const auto output = static_cast<float *__restrict>(out_ptr);
- /* Gx^2, Gy^2 and Gx*Gy */
+ // Gx^2, Gy^2 and Gx*Gy
float16x8_t gx2 = zero;
float16x8_t gy2 = zero;
float16x8_t gxgy = zero;
@@ -158,9 +203,9 @@
vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
vget_low_f16(zero));
- harris_score_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+ harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
- /* Update gx and gy pointer */
+ // Update gx and gy pointer
gx_ptr_0 += in_stride;
gy_ptr_0 += in_stride;
gx_ptr_1 += in_stride;
@@ -169,10 +214,10 @@
gy_ptr_2 += in_stride;
}
- /* Calculate harris score */
+ // Calculate harris score
const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
- /* Store score */
+ // Store score
vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
}
@@ -193,7 +238,7 @@
const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
const auto output = static_cast<float *__restrict>(out_ptr);
- /* Gx^2, Gy^2 and Gx*Gy */
+ // Gx^2, Gy^2 and Gx*Gy
float16x8_t gx2 = zero;
float16x8_t gy2 = zero;
float16x8_t gxgy = zero;
@@ -208,9 +253,9 @@
vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
- harris_score_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+ harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
- /* Update gx and gy pointer */
+ // Update gx and gy pointer
gx_ptr_0 += in_stride;
gy_ptr_0 += in_stride;
gx_ptr_1 += in_stride;
@@ -219,10 +264,10 @@
gy_ptr_2 += in_stride;
}
- /* Calculate harris score */
+ // Calculate harris score
const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
- /* Store score */
+ // Store score
vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
}
@@ -328,18 +373,18 @@
{
inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
{
- /* Trace^2 */
+ // Trace^2
float32x4_t trace2 = vaddq_f32(gx2, gy2);
trace2 = vmulq_f32(trace2, trace2);
- /* Det(A) */
+ // Det(A)
float32x4_t det = vmulq_f32(gx2, gy2);
det = vmlsq_f32(det, gxgy, gxgy);
- /* Det(A) - sensitivity * trace^2 */
+ // Det(A) - sensitivity * trace^2
const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
- /* mc > strength_thresh */
+ // mc > strength_thresh
const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
@@ -348,7 +393,7 @@
inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
float32x4_t norm_factor)
{
- /* Normalize */
+ // Normalize
low_gx = vmulq_f32(low_gx, norm_factor);
low_gy = vmulq_f32(low_gy, norm_factor);
high_gx = vmulq_f32(high_gx, norm_factor);
@@ -361,17 +406,17 @@
const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
- /* Gx*Gx*/
+ // Gx*Gx
gx2 = vmlaq_f32(gx2, l_gx, l_gx);
gx2 = vmlaq_f32(gx2, m_gx, m_gx);
gx2 = vmlaq_f32(gx2, r_gx, r_gx);
- /* Gy*Gy*/
+ // Gy*Gy
gy2 = vmlaq_f32(gy2, l_gy, l_gy);
gy2 = vmlaq_f32(gy2, m_gy, m_gy);
gy2 = vmlaq_f32(gy2, r_gy, r_gy);
- /* Gx*Gy */
+ // Gx*Gy
gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
@@ -380,53 +425,53 @@
inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
float32x4_t norm_factor)
{
- /* Normalize */
+ // Normalize
low_gx = vmulq_f32(low_gx, norm_factor);
low_gy = vmulq_f32(low_gy, norm_factor);
high_gx = vmulq_f32(high_gx, norm_factor);
high_gy = vmulq_f32(high_gy, norm_factor);
- /* L2 values */
+ // L2 values
float32x4_t gx = low_gx;
float32x4_t gy = low_gy;
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
- /* L1 values */
+ // L1 values
gx = vextq_f32(low_gx, high_gx, 1);
gy = vextq_f32(low_gy, high_gy, 1);
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
- /* M values */
+ // M values
gx = vextq_f32(low_gx, high_gx, 2);
gy = vextq_f32(low_gy, high_gy, 2);
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
- /* R1 values */
+ // R1 values
gx = vextq_f32(low_gx, high_gx, 3);
gy = vextq_f32(low_gy, high_gy, 3);
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
- /* R2 values */
+ // R2 values
gx = high_gx;
gy = high_gy;
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
@@ -435,81 +480,81 @@
inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
{
- /* Normalize */
+ // Normalize
low_gx = vmulq_f32(low_gx, norm_factor);
low_gy = vmulq_f32(low_gy, norm_factor);
high_gx = vmulq_f32(high_gx, norm_factor);
high_gy = vmulq_f32(high_gy, norm_factor);
- /* L3 values */
+ // L3 values
float32x4_t gx = low_gx;
float32x4_t gy = low_gy;
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
- /* L2 values */
+ // L2 values
gx = vextq_f32(low_gx, high_gx, 1);
gy = vextq_f32(low_gy, high_gy, 1);
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
- /* L1 values */
+ // L1 values
gx = vextq_f32(low_gx, high_gx, 2);
gy = vextq_f32(low_gy, high_gy, 2);
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
- /* M values */
+ // M values
gx = vextq_f32(low_gx, high_gx, 3);
gy = vextq_f32(low_gy, high_gy, 3);
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
- /* R1 values */
+ // R1 values
gx = high_gx;
gy = high_gy;
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
- /* Change tmp_low and tmp_high for calculating R2 and R3 values */
+ // Change tmp_low and tmp_high for calculating R2 and R3 values
low_gx = high_gx;
low_gy = high_gy;
high_gx = high_gx1;
high_gy = high_gy1;
- /* Normalize */
+ // Normalize
high_gx = vmulq_f32(high_gx, norm_factor);
high_gy = vmulq_f32(high_gy, norm_factor);
- /* R2 values */
+ // R2 values
gx = vextq_f32(low_gx, high_gx, 1);
gy = vextq_f32(low_gy, high_gy, 1);
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
- /* R3 values */
+ // R3 values
gx = vextq_f32(low_gx, high_gx, 2);
gy = vextq_f32(low_gy, high_gy, 2);
- /* Accumulate */
+ // Accumulate
gx2 = vmlaq_f32(gx2, gx, gx);
gy2 = vmlaq_f32(gy2, gy, gy);
gxgy = vmlaq_f32(gxgy, gx, gy);
@@ -525,7 +570,7 @@
const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
const auto output = static_cast<float *__restrict>(output_ptr);
- /* Gx^2, Gy^2 and Gx*Gy */
+ // Gx^2, Gy^2 and Gx*Gy
float32x4x2_t gx2 =
{
{
@@ -548,7 +593,7 @@
}
};
- /* Row0 */
+ // Row0
int16x8x2_t tmp_gx =
{
{
@@ -579,7 +624,7 @@
high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
- /* Row1 */
+ // Row1
tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
@@ -597,7 +642,7 @@
high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
- /* Row2 */
+ // Row2
tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
@@ -615,7 +660,7 @@
high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
- /* Calculate harris score */
+ // Calculate harris score
const float32x4x2_t mc =
{
{
@@ -624,7 +669,7 @@
}
};
- /* Store score */
+ // Store score
vst1q_f32(output + 0, mc.val[0]);
vst1q_f32(output + 4, mc.val[1]);
}
@@ -643,7 +688,7 @@
float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
- /* Gx^2, Gy^2 and Gx*Gy */
+ // Gx^2, Gy^2 and Gx*Gy
float32x4x2_t gx2 =
{
{
@@ -666,7 +711,7 @@
}
};
- /* Row0 */
+ // Row0
float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
@@ -679,7 +724,7 @@
high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
- /* Row1 */
+ // Row1
low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
@@ -692,7 +737,7 @@
high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
- /* Row2 */
+ // Row2
low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
@@ -705,7 +750,7 @@
high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
- /* Calculate harris score */
+ // Calculate harris score
const float32x4x2_t mc =
{
{
@@ -714,7 +759,7 @@
}
};
- /* Store score */
+ // Store score
vst1q_f32(output + 0, mc.val[0]);
vst1q_f32(output + 4, mc.val[1]);
}
@@ -728,7 +773,7 @@
const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
const auto output = static_cast<float *__restrict>(output_ptr);
- /* Gx^2, Gy^2 and Gx*Gy */
+ // Gx^2, Gy^2 and Gx*Gy
float32x4x2_t gx2 =
{
{
@@ -783,14 +828,14 @@
high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
- /* Update gx and gy pointer */
+ // Update gx and gy pointer
gx_ptr_0 += input_stride;
gy_ptr_0 += input_stride;
gx_ptr_1 += input_stride;
gy_ptr_1 += input_stride;
}
- /* Calculate harris score */
+ // Calculate harris score
const float32x4x2_t mc =
{
{
@@ -799,7 +844,7 @@
}
};
- /* Store score */
+ // Store score
vst1q_f32(output + 0, mc.val[0]);
vst1q_f32(output + 4, mc.val[1]);
}
@@ -816,7 +861,7 @@
const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
const auto output = static_cast<float *__restrict>(output_ptr);
- /* Gx^2, Gy^2 and Gx*Gy */
+ // Gx^2, Gy^2 and Gx*Gy
float32x4x2_t gx2 =
{
{
@@ -856,7 +901,7 @@
const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
- /* Update gx and gy pointer */
+ // Update gx and gy pointer
gx_ptr_0 += input_stride;
gy_ptr_0 += input_stride;
gx_ptr_1 += input_stride;
@@ -865,7 +910,7 @@
gy_ptr_2 += input_stride;
}
- /* Calculate harris score */
+ // Calculate harris score
const float32x4x2_t mc =
{
{
@@ -874,7 +919,7 @@
}
};
- /* Store score */
+ // Store score
vst1q_f32(output + 0, mc.val[0]);
vst1q_f32(output + 4, mc.val[1]);
}
@@ -888,7 +933,7 @@
const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
const auto output = static_cast<float *__restrict>(output_ptr);
- /* Gx^2, Gy^2 and Gx*Gy */
+ // Gx^2, Gy^2 and Gx*Gy
float32x4_t gx2 = vdupq_n_f32(0.0f);
float32x4_t gy2 = vdupq_n_f32(0.0f);
float32x4_t gxgy = vdupq_n_f32(0.0f);
@@ -911,17 +956,17 @@
float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
- /* Update gx and gy pointer */
+ // Update gx and gy pointer
gx_ptr_0 += input_stride;
gy_ptr_0 += input_stride;
gx_ptr_1 += input_stride;
gy_ptr_1 += input_stride;
}
- /* Calculate harris score */
+ // Calculate harris score
const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
- /* Store score */
+ // Store score
vst1q_f32(output, mc);
}
@@ -936,7 +981,7 @@
const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
const auto output = static_cast<float *__restrict>(output_ptr);
- /* Gx^2, Gy^2 and Gx*Gy */
+ // Gx^2, Gy^2 and Gx*Gy
float32x4_t gx2 = vdupq_n_f32(0.0f);
float32x4_t gy2 = vdupq_n_f32(0.0f);
float32x4_t gxgy = vdupq_n_f32(0.0f);
@@ -954,7 +999,7 @@
const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
- /* Update gx and gy pointer */
+ // Update gx and gy pointer
gx_ptr_0 += input_stride;
gy_ptr_0 += input_stride;
gx_ptr_1 += input_stride;
@@ -963,10 +1008,10 @@
gy_ptr_2 += input_stride;
}
- /* Calculate harris score */
+ // Calculate harris score
const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
- /* Store score */
+ // Store score
vst1q_f32(output, mc);
}
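
For reference, the vectorised kernels above all evaluate the same per-pixel Harris response: accumulate Gx^2, Gy^2 and Gx*Gy over the block, then take Det(A) - sensitivity * Trace(A)^2 and zero out scores below the strength threshold. A scalar sketch of that formula, assuming the gradients of one block are handed in as flat arrays (illustrative only, not the library API):

#include <cstddef>

// Scalar Harris response for one pixel, given the Gx/Gy gradients of its
// block_size x block_size neighbourhood (normalised like the NEON path).
float harris_score_scalar(const float *gx, const float *gy, size_t count,
                          float norm_factor, float sensitivity, float strength_thresh)
{
    float gx2 = 0.f, gy2 = 0.f, gxgy = 0.f;

    // Accumulate the structure tensor A = [[gx2, gxgy], [gxgy, gy2]]
    for(size_t i = 0; i < count; ++i)
    {
        const float x = gx[i] * norm_factor;
        const float y = gy[i] * norm_factor;
        gx2 += x * x;
        gy2 += y * y;
        gxgy += x * y;
    }

    const float trace = gx2 + gy2;
    const float det   = gx2 * gy2 - gxgy * gxgy;
    const float mc    = det - sensitivity * trace * trace;

    // Same thresholding as the vbslq select above: keep only strong scores
    return (mc > strength_thresh) ? mc : 0.f;
}
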
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 4406eac..c7c23d5 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
@@ -38,6 +39,112 @@
using namespace arm_compute;
+namespace
+{
+template <typename T, bool has_pads>
+inline void linearize_volume(const uint8_t *const in_ptr,
+ T *out_ptr,
+ bool has_bias,
+ int top_left_x,
+ int top_left_y,
+ int kernel_size,
+ int kernel_depth,
+ int input_w,
+ int input_h,
+ int input_stride_x,
+ int input_stride_y,
+ int input_stride_z,
+ int fixed_point_position)
+{
+ const int kernel_size2 = kernel_size * kernel_size;
+ const int x_e = top_left_x + kernel_size;
+ const int y_e = top_left_y + kernel_size;
+
+ // Linearize volume
+ int d = 0;
+ // This for loop linearizes a volume of 3 slices at a time. This allows us to:
+ // 1) reduce the number of iterations of the outer loop over "d"
+ // 2) have an optimized im2col for the first convolution layer, which usually has 3 IFMs
+ for(; d <= (kernel_depth - 3); d += 3)
+ {
+ for(int y = top_left_y; y < y_e; ++y)
+ {
+ if((y < 0 || y >= input_h) && has_pads)
+ {
+ // All the values will be zeros
+ for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ {
+ *(out_ptr + 0 * kernel_size2) = 0;
+ *(out_ptr + 1 * kernel_size2) = 0;
+ *(out_ptr + 2 * kernel_size2) = 0;
+ }
+ }
+ else
+ {
+ for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ {
+ if((x < 0 || x >= input_w) && has_pads)
+ {
+ *(out_ptr + 0 * kernel_size2) = 0;
+ *(out_ptr + 1 * kernel_size2) = 0;
+ *(out_ptr + 2 * kernel_size2) = 0;
+ }
+ else
+ {
+ *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ }
+ }
+ }
+ }
+ out_ptr += 2 * kernel_size2;
+ }
+
+ // Left over
+ for(; d < kernel_depth; d++)
+ {
+ for(int y = top_left_y; y < y_e; ++y)
+ {
+ if((y < 0 || y >= input_h) && has_pads)
+ {
+ // All the values will be zeros
+ memset(out_ptr, 0, kernel_size * sizeof(T));
+ out_ptr += kernel_size;
+ }
+ else
+ {
+ for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ {
+ if((x < 0 || x >= input_w) && has_pads)
+ {
+ *out_ptr = 0;
+ }
+ else
+ {
+ *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ }
+ }
+ }
+ }
+ }
+
+ // Append 1 if the convolution layer has biases
+ if(has_bias)
+ {
+ if(std::is_same<T, arm_compute::qint8_t>::value)
+ {
+ *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position);
+ }
+ else
+ {
+ *out_ptr = static_cast<T>(1);
+ }
+ }
+}
+} // namespace
+
+template <typename T, bool has_pads>
void NEIm2ColKernel::run_generic(const Window &window)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -84,36 +191,27 @@
// Get pointers
const uint8_t *const input_ptr = in.ptr();
- auto output_ptr = reinterpret_cast<float *>(out.ptr());
+ auto output_ptr = reinterpret_cast<T *>(out.ptr());
// Linearize volume
- for(int d = 0; d < kernel_depth; ++d)
- {
- for(int y = top_left_y, y_e = top_left_y + static_cast<int>(_kernel_size); y < y_e; ++y)
- {
- for(int x = top_left_x, x_e = top_left_x + static_cast<int>(_kernel_size); x < x_e; ++x, ++output_ptr)
- {
- if(x < 0 || x >= input_w || y < 0 || y >= input_h)
- {
- *output_ptr = 0.f;
- }
- else
- {
- *output_ptr = *(reinterpret_cast<const float *>(input_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
- }
- }
- }
- }
-
- // Add bias
- if(_has_bias)
- {
- *output_ptr = 1;
- }
+ linearize_volume<T, has_pads>(input_ptr,
+ output_ptr,
+ _has_bias,
+ top_left_x,
+ top_left_y,
+ static_cast<int>(_kernel_size),
+ kernel_depth,
+ input_w,
+ input_h,
+ input_stride_x,
+ input_stride_y,
+ input_stride_z,
+ _input->info()->fixed_point_position());
},
in, out);
}
+template <typename T>
void NEIm2ColKernel::run_reduced(const Window &window)
{
const size_t in_width = _input->info()->dimension(0);
@@ -148,7 +246,14 @@
// Add bias
if(_has_bias)
{
- *(reinterpret_cast<float *>(out_ptr) + out_width - 1) = 1.0f;
+ if(std::is_same<T, arm_compute::qint8_t>::value)
+ {
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = static_cast<T>(1);
+ }
}
}
while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
@@ -161,8 +266,9 @@
void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
_input = input;
_output = output;
@@ -185,11 +291,33 @@
if(run_img2col_reduced)
{
- _func = &NEIm2ColKernel::run_reduced;
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = &NEIm2ColKernel::run_reduced<float>;
+ break;
+ case DataType::QS8:
+ _func = &NEIm2ColKernel::run_reduced<qint8_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
}
else
{
- _func = &NEIm2ColKernel::run_generic;
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
+ break;
+ case DataType::QS8:
+ _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
window.set(Window::DimX, Window::Dimension(0, _convolved_dims.first, 1));
window.set(Window::DimY, Window::Dimension(0, _convolved_dims.second, 1));
window.set(Window::DimZ, Window::Dimension(0, 1, 1));
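
The run_generic path above delegates to linearize_volume, which copies one kernel_size x kernel_size x kernel_depth patch into a row of the output, zero-filling samples that fall outside the input plane when padding is present and appending a 1 for the bias. A compact scalar sketch of that layout, assuming a dense float input stored as input[d * h * w + y * w + x] (illustrative only):

#include <vector>

// Copy the patch whose top-left corner is (top_left_x, top_left_y) into "out",
// channel by channel, zero-padding samples outside the input plane.
void im2col_patch(const std::vector<float> &input, std::vector<float> &out,
                  int w, int h, int depth, int kernel_size,
                  int top_left_x, int top_left_y, bool has_bias)
{
    out.clear();
    for(int d = 0; d < depth; ++d)
    {
        for(int y = top_left_y; y < top_left_y + kernel_size; ++y)
        {
            for(int x = top_left_x; x < top_left_x + kernel_size; ++x)
            {
                const bool inside = (x >= 0 && x < w && y >= 0 && y < h);
                out.push_back(inside ? input[(d * h + y) * w + x] : 0.f);
            }
        }
    }
    if(has_bias)
    {
        out.push_back(1.f); // matches the trailing 1 appended by linearize_volume
    }
}
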
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
index 8651c83..3b09a1b 100644
--- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -40,9 +40,25 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+ _input = input;
+ _output = output;
+
constexpr unsigned int num_elems_processed_per_iteration = 16;
- INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ // The kernel effectively reads 17 values starting at -1, as it loads 16
+ // values starting at -1 and another 16 starting at 0
+ AccessWindowRectangle output_read_access(output->info(), -1, -1, num_elems_processed_per_iteration + 1, 1);
+ AccessWindowHorizontal output_write_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_read_access, output_write_access);
+
+ output_write_access.set_valid_region(win, input->info()->valid_region());
+
+ IKernel::configure(win);
}
BorderSize NEIntegralImageKernel::border_size() const
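
The widened read access above exists because every output pixel of an integral image depends on previously written output, via the recurrence I(x, y) = in(x, y) + I(x-1, y) + I(x, y-1) - I(x-1, y-1). A scalar sketch of that recurrence, assuming row-major uint8_t input and uint32_t output of the same width and height (illustrative only):

#include <cstdint>
#include <vector>

// Plain integral image: each output value is the sum of all input pixels in
// the rectangle from (0, 0) to (x, y) inclusive.
std::vector<uint32_t> integral_image(const std::vector<uint8_t> &in, int width, int height)
{
    std::vector<uint32_t> out(in.size(), 0u);
    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            const uint32_t left    = (x > 0) ? out[y * width + x - 1] : 0u;
            const uint32_t up      = (y > 0) ? out[(y - 1) * width + x] : 0u;
            const uint32_t up_left = (x > 0 && y > 0) ? out[(y - 1) * width + x - 1] : 0u;
            out[y * width + x]     = in[y * width + x] + left + up - up_left;
        }
    }
    return out;
}
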
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000..ab84efb
--- /dev/null
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window)
+{
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * window.thread_id();
+ const int window_step_x = 16 * window.num_threads();
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ float32x4_t acc0 = vdupq_n_f32(0.f);
+ float32x4_t acc1 = vdupq_n_f32(0.f);
+ float32x4_t acc2 = vdupq_n_f32(0.f);
+ float32x4_t acc3 = vdupq_n_f32(0.f);
+
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(input1->ptr_to_element(Coordinates(id[0], 0, id[1])));
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif
+
+ const float *vec_a_end_addr = vec_a + num_elems_vec_a;
+
+ for(; vec_a <= (vec_a_end_addr - 4);)
+ {
+ float32x2_t a0l = vld1_f32(vec_a);
+
+ float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+
+ a0l = vld1_f32(vec_a);
+
+ b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const float a0 = *vec_a;
+
+ const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ acc0 = vmlaq_n_f32(acc0, b00, a0);
+ acc1 = vmlaq_n_f32(acc1, b01, a0);
+ acc2 = vmlaq_n_f32(acc2, b02, a0);
+ acc3 = vmlaq_n_f32(acc3, b03, a0);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ const auto vec_out = reinterpret_cast<float *>(out.ptr());
+
+ vst1q_f32(vec_out + 0, acc0);
+ vst1q_f32(vec_out + 4, acc1);
+ vst1q_f32(vec_out + 8, acc2);
+ vst1q_f32(vec_out + 12, acc3);
+ },
+ ina, out);
+}
+} // namespace
+
+NELocallyConnectedMatrixMultiplyKernel::NELocallyConnectedMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void NELocallyConnectedMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ unsigned int num_elems_processed_per_iteration_x = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ vector_matrix_multiply_f32(_input0, _input1, _output, window);
+}
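
The NEON loop above unrolls the dot product two rows of matrix B at a time and writes 16 output columns per iteration. A scalar reference of the same computation, useful as a mental model or for validating the vectorised path, assuming A is a 1 x K vector and B is a K x N row-major matrix (illustrative only):

#include <vector>

// out[n] = sum_k a[k] * b[k * n_cols + n] -- the locally connected kernel runs
// one such vector/matrix product per spatial location.
std::vector<float> vector_matrix_multiply(const std::vector<float> &a,
                                          const std::vector<float> &b,
                                          int k_size, int n_cols)
{
    std::vector<float> out(n_cols, 0.f);
    for(int k = 0; k < k_size; ++k)
    {
        const float a_k = a[k];
        for(int n = 0; n < n_cols; ++n)
        {
            out[n] += a_k * b[k * n_cols + n];
        }
    }
    return out;
}
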
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index 576bf5c..b188614 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -36,8 +36,8 @@
#include <climits>
#include <cstddef>
-using namespace arm_compute;
-
+namespace arm_compute
+{
NEMinMaxKernel::NEMinMaxKernel()
: _func(), _input(nullptr), _min(), _max(), _min_init(), _max_init(), _mtx()
{
@@ -190,7 +190,6 @@
return false;
}
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
template <unsigned int...>
struct index_seq
{
@@ -210,10 +209,7 @@
{
using type = index_seq<S...>;
};
-#endif /* DOXYGEN_SKIP_THIS */
-namespace arm_compute
-{
template <class T, unsigned int... N>
struct NEMinMaxLocationKernel::create_func_table<T, index_seq<N...>>
{
@@ -225,7 +221,6 @@
{
&NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
};
-} // namespace arm_compute
void NEMinMaxLocationKernel::configure(const IImage *input, int32_t *min, int32_t *max,
ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc,
@@ -363,3 +358,4 @@
}
}
}
+} // namespace arm_compute
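
The kernel above builds its dispatch table from a hand-rolled compile-time index sequence (this codebase predates std::index_sequence) expanded into a pack of boolean template flags. A self-contained sketch of that trick follows, with a hypothetical process<min, max>() worker standing in for minmax_loc; only the mechanism is taken from the code above.

#include <array>
#include <cstdio>

template <unsigned int... S>
struct index_seq
{
};

template <unsigned int N, unsigned int... S>
struct gen_index_seq : gen_index_seq<N - 1, N - 1, S...>
{
};

template <unsigned int... S>
struct gen_index_seq<0u, S...>
{
    using type = index_seq<S...>;
};

// Example worker parameterised on two flags (e.g. "compute min", "compute max").
template <bool use_min, bool use_max>
void process()
{
    std::printf("use_min=%d use_max=%d\n", use_min, use_max);
}

using func_ptr = void (*)();

template <typename Seq>
struct func_table;

// Expand the index pack into one function pointer per flag combination,
// decoding the flags from the bits of each index.
template <unsigned int... N>
struct func_table<index_seq<N...>>
{
    static const std::array<func_ptr, sizeof...(N)> table;
};

template <unsigned int... N>
const std::array<func_ptr, sizeof...(N)> func_table<index_seq<N...>>::table =
{
    { &process<bool(N & 2), bool(N & 1)>... }
};

int main()
{
    // Index 2 (binary 10) selects the specialisation with use_min = true, use_max = false.
    func_table<gen_index_seq<4>::type>::table[2]();
    return 0;
}
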
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index 4ca5dca..03d1409 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -36,8 +36,8 @@
#include <tuple>
#include <utility>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
const uint8x16_t zero_u8 = vdupq_n_u8(0);
@@ -419,9 +419,6 @@
}
}
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
-namespace arm_compute
-{
template <>
void NENonLinearFilterKernel::median_filter_box<3, 3>(const Window &win)
{
@@ -507,8 +504,6 @@
},
input, output);
}
-} // namespace arm_compute
-#endif
template <int mask_w, int mask_h>
void NENonLinearFilterKernel::min_filter_box(const Window &win)
@@ -588,9 +583,6 @@
input, output);
}
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
-namespace arm_compute
-{
template <>
void NENonLinearFilterKernel::median_filter_cross<3, 3>(const Window &win)
{
@@ -656,8 +648,6 @@
},
input, output);
}
-} // namespace arm_compute
-#endif
template <int mask_w, int mask_h>
void NENonLinearFilterKernel::min_filter_cross(const Window &win)
@@ -751,9 +741,6 @@
input, output);
}
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
-namespace arm_compute
-{
template <>
void NENonLinearFilterKernel::median_filter_disk<5, 5>(const Window &win)
{
@@ -878,8 +865,6 @@
},
input, output);
}
-} // namespace arm_compute
-#endif
template <int mask_w, int mask_h>
void NENonLinearFilterKernel::non_linear_filter_generic(const Window &win)
@@ -1021,3 +1006,4 @@
break;
}
}
+} // namespace arm_compute
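
The change above drops the DOXYGEN_SKIP_THIS workaround by wrapping the whole file in namespace arm_compute rather than combining a using-directive with reopened namespace blocks. The language rule driving this: an explicit specialisation of a member template must be declared in a namespace that encloses the class, which the file-wide namespace now guarantees. A small standalone illustration of that rule, with a hypothetical Filter class:

#include <cstdio>

namespace demo
{
struct Filter
{
    template <int W, int H>
    void apply();
};

// Generic definition, inside the enclosing namespace.
template <int W, int H>
void Filter::apply()
{
    std::printf("generic %dx%d path\n", W, H);
}

// Explicit specialisation: also has to live in a namespace enclosing demo::Filter.
template <>
void Filter::apply<3, 3>()
{
    std::printf("specialised 3x3 path\n");
}
} // namespace demo

int main()
{
    demo::Filter f;
    f.apply<3, 3>(); // picks the explicit specialisation
    f.apply<5, 5>(); // picks the generic template
    return 0;
}
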
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 90ce0e5..a971dc8 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -23,7 +23,9 @@
*/
#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -33,7 +35,7 @@
using namespace arm_compute;
NENormalizationLayerKernel::NENormalizationLayerKernel()
- : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP), _border_size()
+ : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP_1D), _border_size()
{
}
@@ -44,28 +46,51 @@
void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
- const unsigned int border_width = (norm_info.type() == NormType::IN_MAP) ? 3 : 0;
+ const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min(norm_info.norm_size() / 2, 3U);
_input = input;
_input_squared = input_squared;
_output = output;
_norm_info = norm_info;
- _func = (norm_info.type() == NormType::IN_MAP) ? &NENormalizationLayerKernel::normalize<0> : &NENormalizationLayerKernel::normalize<2>;
_border_size = BorderSize(0, border_width);
- constexpr unsigned int num_elems_processed_per_iteration = 4;
- const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+ const bool is_dt_f32 = _input->info()->data_type() == DataType::F32;
+
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, false> : &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, true> : &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<2, false> : &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+
+ const unsigned int num_elems_processed_per_iteration = (is_dt_f32) ? 4 : 16;
+ const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+ const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
// Configure window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration);
- AccessWindowHorizontal input_squared_access(input_squared->info(), -_border_size.left, num_elems_read_per_iteration);
+ AccessWindowRectangle input_access(input->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows);
+ AccessWindowRectangle input_squared_access(input_squared->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, input_squared_access, output_access);
@@ -75,19 +100,22 @@
INEKernel::configure(win);
}
-template <unsigned int dim>
+template <unsigned int dim, bool do_2D_norm>
void NENormalizationLayerKernel::normalize(const Window &window)
{
Iterator input(_input, window);
Iterator input_squared(_input_squared, window);
Iterator output(_output, window);
+ const int dim_y = 1;
const int radius = _norm_info.norm_size() / 2;
const int total_size = _input->info()->dimension(dim) - 1;
const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
- // We account padding when we normalize across X
- const int min_left = (dim == 0) ? -static_cast<int>(border_size().left) : 0;
- const int max_right = (dim == 0) ? total_size + border_size().left : total_size;
+ // We account for padding across X only and we iterate over rows
+ const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+ const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
+ const int min_top = 0;
+ const int max_bottom = _input->info()->dimension(dim_y) - 1;
const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
@@ -96,25 +124,89 @@
execute_window_loop(window, [&](const Coordinates & id)
{
// Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
const int first_slice = std::max(current_slice - radius, min_left);
const int last_slice = std::min(current_slice + radius, max_right);
- // Accumulate cross map values
+ // Accumulate 2D In-Map values
float32x4_t accu = vdupq_n_f32(0.f);
- for(int i = first_slice; i <= last_slice; ++i)
+ for(int j = first_row; j <= last_row; j++)
{
- accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<float *>(input_squared.ptr() + (i - current_slice) * input_squared_stride)));
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
+ }
}
// Normalize
const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
- const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), vinvq_f32(normalized));
+ const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
},
input, input_squared, output);
}
+template <unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator input_squared(_input_squared, window);
+ Iterator output(_output, window);
+
+ const int dim_y = 1;
+ const int radius = _norm_info.norm_size() / 2;
+ const int total_size = _input->info()->dimension(dim) - 1;
+ const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
+ // We account for padding across X only and we iterate over rows
+ const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+ const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
+ const int min_top = 0;
+ const int max_bottom = _input->info()->dimension(dim_y) - 1;
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
+ const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
+ const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ qint8x16_t accu = vdupq_n_qs8(0);
+ for(int j = first_row; j <= last_row; ++j)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
+ const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
+ const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+}
+
void NENormalizationLayerKernel::run(const Window &window)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
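
The accumulation above implements local response normalisation: sum the squared inputs over the window (1D in-map, 2D in-map, or across feature maps), then divide each input by (kappa + scale_coeff * sum)^beta. A scalar cross-map sketch, assuming the activations at one spatial position are indexed by channel and that scale_coeff plays the role of alpha / norm_size (illustrative only):

#include <algorithm>
#include <cmath>
#include <vector>

// Cross-map normalisation of a single spatial position across channels.
std::vector<float> normalize_cross_map(const std::vector<float> &act, int norm_size,
                                       float scale_coeff, float beta, float kappa)
{
    const int          radius = norm_size / 2;
    const int          num_ch = static_cast<int>(act.size());
    std::vector<float> out(num_ch, 0.f);

    for(int c = 0; c < num_ch; ++c)
    {
        const int first = std::max(c - radius, 0);
        const int last  = std::min(c + radius, num_ch - 1);

        float accu = 0.f;
        for(int i = first; i <= last; ++i)
        {
            accu += act[i] * act[i]; // the kernel reads these from the pre-squared tensor
        }
        out[c] = act[c] / std::pow(kappa + scale_coeff * accu, beta);
    }
    return out;
}
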
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index 6b7510f..aa8c7a1 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
@@ -61,7 +62,6 @@
{
// Scale
const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
-
// Round to nearest (round half up)
// Add +0.5 for all values
// Afterwards vcvt rounds toward zero
@@ -125,6 +125,25 @@
}
template <bool is_scale255, bool is_sat>
+void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
+{
+ // n is the exponent of the scaling factor, i.e. scale = 1/2^n. Currently, we only support a scaling factor of 1, so n = 0.
+ ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 8-bit fixed-point pixel-wise multiplication");
+ ARM_COMPUTE_UNUSED(n);
+
+ const auto input1 = static_cast<const qint8_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const qint8_t *__restrict>(input2_ptr);
+ const auto output = static_cast<qint8_t *__restrict>(output_ptr);
+
+ const qint8x16_t ta1 = vld1q_qs8(input1);
+ const qint8x16_t ta2 = vld1q_qs8(input2);
+
+ qint8x16_t res = (is_sat) ? vqmulq_qs8(ta1, ta2, fixed_point_position) : vmulq_qs8(ta1, ta2, fixed_point_position);
+
+ vst1q_s8(output, res);
+}
+
+template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
{
int32x4_t tmp1_high = vmovl_s16(vget_high_s16(input1));
@@ -142,17 +161,28 @@
}
else
{
+ // Right shift amount
const int32x4_t vn = vdupq_n_s32(-n);
-
+ // Left shift amount
+ const int32x4_t vnl = vdupq_n_s32(n);
+ // Calculate conversion bit
+ const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
+ const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
+ const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
+ const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
+ const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
+ const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
+ const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
+ const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
if(is_sat)
{
- tmp1_high = vqshlq_s32(tmp1_high, vn);
- tmp1_low = vqshlq_s32(tmp1_low, vn);
+ tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
+ tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
}
else
{
- tmp1_high = vshlq_s32(tmp1_high, vn);
- tmp1_low = vshlq_s32(tmp1_low, vn);
+ tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
+ tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
}
}
@@ -297,17 +327,23 @@
} // namespace
NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
- : _func_float(nullptr), _func_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+ : _func_float(nullptr), _func_int(nullptr), _func_q_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
{
}
void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
+ if(input1->info()->data_type() == DataType::QS8 || input2->info()->data_type() == DataType::QS8 || output->info()->data_type() == DataType::QS8)
+ {
+ // All data types must be QS8
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output);
+ }
_input1 = input1;
_input2 = input2;
@@ -315,13 +351,14 @@
_scale = scale;
_scale_exponent = 0;
_func_int = nullptr;
+ _func_q_int = nullptr;
_func_float = nullptr;
bool is_scale_255 = false;
// Check and validate scaling factor
if(std::abs(scale - scale255_constant) < 0.00001f)
{
- ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+ ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
ARM_COMPUTE_UNUSED(rounding_policy);
is_scale_255 = true;
@@ -409,6 +446,17 @@
_func_int = is_sat ? &mul_U8_U8_S16_n<false, true> : &mul_U8_U8_S16_n<false, false>;
}
}
+ else if(DataType::QS8 == dt_input1 && DataType::QS8 == dt_input2 && DataType::QS8 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<true, true> : &mul_QS8_QS8_QS8_n<true, false>;
+ }
+ else
+ {
+ _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<false, true> : &mul_QS8_QS8_QS8_n<false, false>;
+ }
+ }
else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
{
_func_float = &mul_F32_F32_F32_n<false, false>;
@@ -455,6 +503,15 @@
},
input1, input2, output);
}
+ else if(_func_q_int != nullptr)
+ {
+ int fixed_point_position = _input1->info()->fixed_point_position();
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position);
+ },
+ input1, input2, output);
+ }
else
{
ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
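
The sign-bit arithmetic introduced above makes the right shift by n behave like division by 2^n, i.e. round toward zero instead of toward negative infinity, before the optional saturation. A scalar sketch of that correction (assuming the usual arithmetic shift of negative signed values, which is implementation-defined before C++20 but universal on the targets here):

#include <cassert>
#include <cstdint>

// An arithmetic right shift rounds toward negative infinity, so for negative
// values we add (2^n - 1) first; the shift then rounds toward zero, matching
// integer division by 2^n.
int32_t shift_round_to_zero(int32_t value, int n)
{
    const int32_t sign    = static_cast<int32_t>(static_cast<uint32_t>(value) >> 31); // 1 if negative, else 0
    const int32_t convert = (sign << n) - sign;                                       // (2^n - 1) for negatives, 0 otherwise
    return (value + convert) >> n;
}

int main()
{
    assert(shift_round_to_zero(7, 1) == 3);   // 7 / 2
    assert(shift_round_to_zero(-7, 1) == -3); // rounds toward zero, not down to -4
    assert(shift_round_to_zero(-8, 2) == -2); // -8 / 4
    return 0;
}
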
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 43d8743..30b67b6 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -25,8 +25,10 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
@@ -51,10 +53,23 @@
int end_y = std::min(start_y + pool_size, upper_bound_h);
return 1.f / ((end_y - start_y) * (end_x - start_x));
}
+
+inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
+ int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
+{
+ static std::array<qint8_t, 10> scale_values_q8 =
+ { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
+ const int start_x = id.x() * stride_x - pad_x;
+ const int start_y = id.y() * stride_y - pad_y;
+ const int end_x = std::min(start_x + pool_size, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
+ const int val = ((end_y - start_y) * (end_x - start_x));
+ return scale_values_q8[val] >> (7 - fixed_point_position);
+}
} // namespace
NEPoolingLayerKernel::NEPoolingLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _border_size(0)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
{
}
@@ -78,11 +93,14 @@
std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_type == PoolingType::AVG && input->info()->fixed_point_position() > 6);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_stride_x > 2);
// Check output dimensions
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
@@ -92,11 +110,33 @@
ARM_COMPUTE_UNUSED(pooled_h);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
- const int num_elems_read_per_iteration = (pool_size == 2) ? 2 : 4; // We use vload4 for pooling3
- const int input_width = input->info()->dimension(0);
- const int input_height = input->info()->dimension(1);
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+ unsigned int num_elems_read_per_iteration = 0;
+ unsigned int num_elems_processed_per_iteration = 0;
+ unsigned int num_elems_horizontal_window = 0;
+
+ // Select element size
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = (pool_size == 2) ? 8 : 7;
+ num_elems_horizontal_window = 8;
+ break;
+ case DataType::F32:
+ num_elems_read_per_iteration = (pool_size == 2) ? 2 : 4; // We use vload4 for pooling3
+ num_elems_processed_per_iteration = 1;
+ num_elems_horizontal_window = 1;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ _num_elems_processed_per_iteration = num_elems_processed_per_iteration;
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
// Set instance variables
_input = input;
@@ -110,10 +150,24 @@
switch(pool_size)
{
case 2:
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2<PoolingType::MAX>;
+ if(input->info()->data_type() == DataType::QS8)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ }
+ else if(input->info()->data_type() == DataType::F32)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+ }
break;
case 3:
- _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3<PoolingType::MAX>;
+ if(input->info()->data_type() == DataType::QS8)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ }
+ else if(input->info()->data_type() == DataType::F32)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+ }
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling size");
@@ -121,18 +175,61 @@
}
// Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window);
update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
INEKernel::configure(win);
}
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
+ qint8x8_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+ const qint8x8_t scale_vec = vdup_n_qs8(scale);
+
+ // Perform pooling
+ const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
+ res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
+ }
+ else
+ {
+ const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
+ res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
+ }
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -173,7 +270,80 @@
}
template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
+ const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
+ qint8x8_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+ const qint8x8_t scale_vec = vdup_n_qs8(scale);
+
+ // Perform pooling for stride 2
+ const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
+ const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
+ const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
+ const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
+ if(pool_stride_x == 2)
+ {
+ const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
+ static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ res = vtbl2_s8(table, lookup_val);
+ }
+ else
+ {
+ res = vget_low_s8(final_sum);
+ }
+ res = vqmul_qs8(res, scale_vec, fixed_point_position);
+ }
+ else
+ {
+ const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
+ const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
+ const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
+ const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
+
+ if(pool_stride_x == 2)
+ {
+ const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
+ static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ res = vtbl2_s8(table, lookup_val);
+ }
+ else
+ {
+ res = vget_low_s8(final_max);
+ }
+ }
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
@@ -227,8 +397,17 @@
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
// Set step for input in x and y direction for the input
- Window window_input(window);
- window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, pool_stride_x));
+ Window window_input(window);
+ unsigned int window_x_inc = 0;
+ if(_input->info()->data_type() == DataType::QS8)
+ {
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ }
+ else
+ {
+ window_x_inc = pool_stride_x;
+ }
+ window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
// Run function
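The hard-coded scale_values_q8 table used by calculate_avg_scale_q8 above holds the
reciprocal of the pooling area with 7 fractional bits; a sketch of how such a table could
be generated (an assumption about its derivation), with the subsequent shift by
(7 - fixed_point_position) mapping the value onto the tensor's own Q format:

    #include <array>
    #include <cstdint>

    // Entry i holds 1/i in Q0.7 (truncating division); entries 0 and 1 stay at 0 as placeholders.
    std::array<int8_t, 10> make_avg_scale_table_q8()
    {
        std::array<int8_t, 10> table{};
        for(int area = 2; area < 10; ++area)
        {
            table[area] = static_cast<int8_t>(128 / area);
        }
        return table; // { 0, 0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE }
    }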
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index a6e6cad..942662e 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
@@ -40,8 +41,75 @@
using namespace arm_compute;
+namespace
+{
+void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+ float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const float32x4_t current_value = vld1q_f32(in_ptr);
+ vec_max = vmaxq_f32(vec_max, current_value);
+ },
+ input);
+
+ float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max));
+ carry_max = vpmax_f32(carry_max, carry_max);
+
+ *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+
+void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+ qint8x16_t vec_max = vdupq_n_s8(-1);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ const qint8x16_t current_value = vld1q_qs8(in_ptr);
+ vec_max = vmaxq_qs8(vec_max, current_value);
+ },
+ input);
+
+ qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max));
+ carry_max = vpmax_qs8(carry_max, carry_max);
+ carry_max = vpmax_qs8(carry_max, carry_max);
+ carry_max = vpmax_qs8(carry_max, carry_max);
+
+ *(reinterpret_cast<int8_t *>(output.ptr())) = vget_lane_s8(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+} // namespace
+
NELogits1DMaxKernel::NELogits1DMaxKernel()
- : _border_size()
+ : _func(nullptr), _border_size()
{
}
@@ -52,12 +120,26 @@
void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- const int input_width = input->info()->valid_region().shape.x();
- constexpr unsigned int num_elems_processed_per_iteration = 4;
+ const int input_width = input->info()->valid_region().shape.x();
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &logits_1d_max_qs8;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case DataType::F32:
+ num_elems_processed_per_iteration = 4;
+ _func = &logits_1d_max_f32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
_input = input;
_output = output;
@@ -81,60 +163,170 @@
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
- Window in_slice = window.first_slice_window_1D();
+ (*_func)(_input, _output, window);
+}
+namespace
+{
+void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
Window window_max(window);
window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 4;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
do
{
- Iterator in(_input, in_slice);
- Iterator out(_output, max_slice);
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
- float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
- execute_window_loop(in_slice, [&](const Coordinates & id)
+ // Init sum to zero
+ float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const float *>(_max.ptr());
+ const float32x4_t vec_max = vdupq_n_f32(*max_ptr);
+
+ // Run neon loop
+ for(int i = 0; i < long_steps; ++i)
{
- const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
- const float32x4_t current_value = vld1q_f32(in_ptr);
- vec_max = vmaxq_f32(vec_max, current_value);
- },
- in);
+ float32x4_t vec_elements = vld1q_f32(in_ptr);
+ vec_elements = vsubq_f32(vec_elements, vec_max);
+ vec_elements = vexpq_f32(vec_elements);
- float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max));
- carry_max = vpmax_f32(carry_max, carry_max);
+ vst1q_f32(exp_ptr, vec_elements);
+ vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
- *(reinterpret_cast<float *>(out.ptr())) = vget_lane_f32(carry_max, 0);
+ in_ptr += step;
+ exp_ptr += step;
+ }
+
+ // Reduce sum
+ float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
+ carry_addition = vpadd_f32(carry_addition, carry_addition);
+ float sum = vget_lane_f32(carry_addition, 0);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ float element = std::exp(in_ptr[i] - *max_ptr);
+ exp_ptr[i] = element;
+ sum += element;
+ }
+
+ *(reinterpret_cast<float *>(_sum.ptr())) = sum;
}
while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
}
+void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 8;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+ const int fixed_point_position = in->info()->fixed_point_position();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<qint8_t *>(exp.ptr());
+
+ // Init sum to zero
+ qint16x8_t vec_sum_value = vdupq_n_qs16(0);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const qint8_t *>(_max.ptr());
+ const qint8x8_t vec_max = vdup_n_qs8(*max_ptr);
+
+ // Run neon loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ qint8x8_t vec_elements = vld1_qs8(in_ptr);
+ vec_elements = vqsub_qs8(vec_elements, vec_max);
+ vec_elements = vqexp_qs8(vec_elements, fixed_point_position);
+
+ vst1_qs8(exp_ptr, vec_elements);
+ vec_sum_value = vqaddq_qs16(vec_sum_value, vmovl_s8(vec_elements));
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+ // Reduce sum
+ const qint16x4_t sum_red = vqadd_qs16(vget_low_s16(vec_sum_value), vget_high_s16(vec_sum_value));
+ const qint16_t sum0 = sqadd_qs16(vget_lane_s16(sum_red, 0), vget_lane_s16(sum_red, 1));
+ const qint16_t sum1 = sqadd_qs16(vget_lane_s16(sum_red, 2), vget_lane_s16(sum_red, 3));
+ qint16_t sum = sqadd_qs16(sum0, sum1);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ qint8_t element = sqexp_qs8(sqsub_qs8(in_ptr[i], *max_ptr), fixed_point_position);
+ exp_ptr[i] = element;
+ sum = sqadd_qs16(sum, element);
+ }
+
+ *(reinterpret_cast<qint8_t *>(_sum.ptr())) = sqmovn_qs16(sum);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+} //namespace
NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel()
- : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr), _border_size(0)
+ : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
{
}
-BorderSize NELogits1DShiftExpSumKernel::border_size() const
-{
- return _border_size;
-}
void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, max, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
- const int input_width = input->info()->valid_region().shape.x();
- constexpr unsigned int num_elems_processed_per_iteration = 4;
+ unsigned int num_elems_processed_per_iteration = input->info()->valid_region().shape.x();
- _input = input;
- _max = max;
- _output = output;
- _sum = sum;
- _border_size = BorderSize(0, input_width % num_elems_processed_per_iteration, 0, 0);
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &logits_1d_shift_exp_sum_qs8;
+ break;
+ case DataType::F32:
+ _func = &logits_1d_shift_exp_sum_f32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ _input = input;
+ _max = max;
+ _output = output;
+ _sum = sum;
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -155,57 +347,87 @@
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ (*_func)(_input, _max, _output, _sum, window);
+}
- Window max_slice = window_max.first_slice_window_1D();
+namespace
+{
+void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
Window in_slice = window.first_slice_window_1D();
do
{
- Iterator in(_input, in_slice);
- Iterator exp(_output, in_slice);
- Iterator max(_max, max_slice);
- Iterator sum(_sum, max_slice);
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
- float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
-
- const auto max_ptr = reinterpret_cast<const float *>(max.ptr());
- float32x4_t vec_max = vdupq_n_f32(*max_ptr);
+ const float sum_value = *reinterpret_cast<const float *>(_sum.ptr());
+ const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
execute_window_loop(in_slice, [&](const Coordinates & id)
{
- const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
- const auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(output.ptr());
- float32x4_t vec_elements = vld1q_f32(in_ptr);
- vec_elements = vsubq_f32(vec_elements, vec_max);
- vec_elements = vexpq_f32(vec_elements);
+ const float32x4_t vec_in = vld1q_f32(in_ptr);
+ const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
- vst1q_f32(exp_ptr, vec_elements);
-
- vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
+ vst1q_f32(out_ptr, normalized_value);
},
- in, exp);
-
- float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
- carry_addition = vpadd_f32(carry_addition, carry_addition);
-
- *(reinterpret_cast<float *>(sum.ptr())) = vget_lane_f32(carry_addition, 0);
+ input, output);
}
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
}
+void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ const int fixed_point_position = in->info()->fixed_point_position();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
+ const int8_t sum_value = *reinterpret_cast<const qint8_t *>(_sum.ptr());
+ const qint8x16_t vec_sum_inversed = vqrecipq_qs8(vdupq_n_qs8(sum_value), fixed_point_position);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<qint8_t *>(output.ptr());
+
+ const qint8x16_t vec_in = vld1q_qs8(in_ptr);
+ const qint8x16_t normalized_value = vqmulq_qs8(vec_in, vec_sum_inversed, fixed_point_position);
+
+ vst1q_qs8(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+} // namespace
NELogits1DNormKernel::NELogits1DNormKernel()
- : _input(nullptr), _sum(nullptr), _output(nullptr)
+ : _func(nullptr), _input(nullptr), _sum(nullptr), _output(nullptr)
{
}
void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, sum);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
_input = input;
@@ -213,7 +435,21 @@
_output = output;
// Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 4;
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &logits_1d_norm_qs8;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case DataType::F32:
+ num_elems_processed_per_iteration = 4;
+ _func = &logits_1d_norm_f32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -232,32 +468,7 @@
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
- Window window_sum(window);
- window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window sum_slice = window_sum.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
-
- do
- {
- Iterator in(_input, in_slice);
- Iterator sum(_sum, sum_slice);
- Iterator out(_output, in_slice);
-
- float sum_value = *reinterpret_cast<const float *>(sum.ptr());
- const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
- const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
- const float32x4_t vec_in = vld1q_f32(in_ptr);
- const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
-
- vst1q_f32(out_ptr, normalized_value);
- },
- in, out);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+ (*_func)(_input, _sum, _output, window);
}
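Taken together, the three kernels in this file compute a row-wise softmax. A scalar
reference of the assumed overall behaviour (maximum, shifted exponential plus sum, then
normalisation); it mirrors the structure of the NEON paths above without the
vectorisation or fixed-point handling:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    std::vector<float> softmax_1d_reference(const std::vector<float> &in)
    {
        const float max_value = *std::max_element(in.begin(), in.end()); // NELogits1DMaxKernel
        float sum = 0.f;
        std::vector<float> out(in.size());
        for(std::size_t i = 0; i < in.size(); ++i)                       // NELogits1DShiftExpSumKernel
        {
            out[i] = std::exp(in[i] - max_value);
            sum += out[i];
        }
        for(float &v : out)                                              // NELogits1DNormKernel
        {
            v /= sum;
        }
        return out;
    }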
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index d0ef58f..492de8a 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -179,11 +179,20 @@
void NETransposeKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t w_out = input->info()->dimension(1);
+ const size_t h_out = input->info()->dimension(0);
+ output_shape.set(0, w_out);
+ output_shape.set(1, h_out);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(1));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
_input = input;
_output = output;
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
new file mode 100644
index 0000000..aa6be44
--- /dev/null
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+namespace
+{
+template <typename T>
+void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
+{
+ const unsigned int kernel_size = input->info()->dimension(0);
+ const unsigned int kernel_depth = input->info()->dimension(2);
+ const unsigned int input_stride_x = input->info()->strides_in_bytes().x();
+ const unsigned int input_stride_y = input->info()->strides_in_bytes().y();
+ const unsigned int input_stride_z = input->info()->strides_in_bytes().z();
+ const unsigned int output_stride_y = output->info()->strides_in_bytes().y();
+
+ // Create iterators
+ Iterator in(input, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get column index
+ const int kernel_idx = id[3];
+ const int kernel_idz = id[4];
+
+ // Setup pointers
+ const uint8_t *tmp_input_ptr = in.ptr();
+ uint8_t *tmp_output_ptr = output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+ const uint8_t *curr_input_row_ptr = tmp_input_ptr;
+ const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+ // Linearize volume
+ for(unsigned int d = 0; d < kernel_depth; ++d)
+ {
+ for(unsigned int j = 0; j < kernel_size; ++j)
+ {
+ for(unsigned int i = 0; i < kernel_size; ++i)
+ {
+ *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
+ tmp_input_ptr += input_stride_x;
+ tmp_output_ptr += output_stride_y;
+ }
+ curr_input_row_ptr += input_stride_y;
+ tmp_input_ptr = curr_input_row_ptr;
+ }
+ curr_input_depth_ptr += input_stride_z;
+ curr_input_row_ptr = curr_input_depth_ptr;
+ tmp_input_ptr = curr_input_depth_ptr;
+ }
+
+ // Add bias
+ if(bias != nullptr)
+ {
+ *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz))));
+ }
+ },
+ in);
+}
+} // namespace
+
+NEWeightsReshapeKernel::NEWeightsReshapeKernel()
+ : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.collapse(3);
+ const size_t tmp_dim = output_shape[0];
+ output_shape.set(0, output_shape[1]);
+ output_shape.set(1, tmp_dim + (bias != nullptr ? 1 : 0));
+
+ // Set data type and fixed point position for output tensor if not yet configured
+ set_data_type_if_unknown(*output->info(), dt);
+ set_fixed_point_position_if_zero(*output->info(), fixed_point_position);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ if(bias != nullptr)
+ {
+ TensorShape bias_shape{ input->info()->tensor_shape()[3] };
+
+ // Set data type and shape for bias tensor if not yet configured
+ set_data_type_if_unknown(*bias->info(), dt);
+ set_fixed_point_position_if_zero(*bias->info(), fixed_point_position);
+ set_shape_if_empty(*bias->info(), bias_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(bias->info()->tensor_shape(), bias_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+ }
+
+ _input = input;
+ _bias = bias;
+ _output = output;
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ _func = &weights_reshape<uint32_t>;
+ break;
+ }
+ case DataType::QS8:
+ {
+ _func = &weights_reshape<uint8_t>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR_ON("Data type not supported");
+ break;
+ }
+ }
+
+ // Configure kernel
+ Window window = calculate_max_window(*input->info(), Steps());
+ window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+
+ // This kernel doesn't need padding, so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(window);
+}
+
+void NEWeightsReshapeKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ (*_func)(_input, _bias, _output, window);
+}
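The linearisation that weights_reshape performs can be summarised with a plain-array
sketch (an assumed reference using std::vector instead of ITensor): every filter of a
[k, k, depth, num_kernels] weight tensor becomes one column of the reshaped matrix, with
an optional trailing bias row:

    #include <cstddef>
    #include <vector>

    std::vector<float> reshape_weights(const std::vector<float> &w, int k, int depth, int num_kernels,
                                       const std::vector<float> *bias)
    {
        const int filter_len = k * k * depth;
        const int col_len    = filter_len + (bias != nullptr ? 1 : 0);
        std::vector<float> out(static_cast<std::size_t>(col_len) * num_kernels);
        for(int n = 0; n < num_kernels; ++n)
        {
            for(int i = 0; i < filter_len; ++i)
            {
                // Row i, column n of the reshaped matrix (row-major storage).
                out[static_cast<std::size_t>(i) * num_kernels + n] = w[static_cast<std::size_t>(n) * filter_len + i];
            }
            if(bias != nullptr)
            {
                out[static_cast<std::size_t>(filter_len) * num_kernels + n] = (*bias)[n];
            }
        }
        return out;
    }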
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
new file mode 100644
index 0000000..f5a282d
--- /dev/null
+++ b/src/core/SubTensorInfo.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/SubTensorInfo.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+SubTensorInfo::SubTensorInfo()
+ : _parent(nullptr), _tensor_shape(), _coords(), _valid_region{ Coordinates(), _tensor_shape }
+{
+}
+
+SubTensorInfo::SubTensorInfo(ITensorInfo *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+ : _parent(parent), _tensor_shape(tensor_shape), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }
+{
+ ARM_COMPUTE_ERROR_ON(parent == nullptr);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
+
+ // Initialize valid region
+ Coordinates coordinates;
+ coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+ _valid_region = ValidRegion{ coordinates, _tensor_shape };
+}
+
+void SubTensorInfo::set_tensor_shape(TensorShape shape)
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
+ _tensor_shape = shape;
+}
+
+bool SubTensorInfo::extend_padding(const PaddingSize &padding)
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ ARM_COMPUTE_ERROR_ON(!_parent->is_resizable());
+
+ // Extend parent padding if required
+ return _parent->extend_padding(padding);
+}
+
+size_t SubTensorInfo::offset_element_in_bytes(const Coordinates &pos) const
+{
+ ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(pos, _tensor_shape.num_dimensions());
+
+ size_t offset = offset_first_element_in_bytes();
+ const Strides &strides = strides_in_bytes();
+
+ for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ {
+ offset += pos[i] * strides[i];
+ }
+
+ return offset;
+}
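offset_element_in_bytes above is a plain dot product between the element coordinates and
the byte strides; a minimal 2-D sketch with assumed values:

    #include <cstddef>

    // For an 8 x 4 tensor of 1-byte elements with no padding the strides are {1, 8} bytes,
    // so element (3, 2) lives at offset_first + 3*1 + 2*8 = offset_first + 19 bytes.
    std::size_t element_offset_bytes(std::size_t offset_first, const std::size_t strides[2], const int pos[2])
    {
        return offset_first + pos[0] * strides[0] + pos[1] * strides[1];
    }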
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index e81c0de..3d07ccb 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -31,8 +31,29 @@
using namespace arm_compute;
TensorInfo::TensorInfo()
- : _total_size(0), _fixed_point_pos(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true },
- _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }
+ : _total_size(0), _fixed_point_position(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN),
+ _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }
+{
+}
+
+TensorInfo::TensorInfo(const ITensorInfo &info)
+ : TensorInfo()
+{
+ _total_size = info.total_size();
+ _fixed_point_position = info.fixed_point_position();
+ _offset_first_element_in_bytes = info.offset_first_element_in_bytes();
+ _strides_in_bytes = info.strides_in_bytes();
+ _num_channels = info.num_channels();
+ _tensor_shape = info.tensor_shape();
+ _data_type = info.data_type();
+ _format = info.format();
+ _is_resizable = info.is_resizable();
+ _valid_region = info.valid_region();
+ _padding = info.padding();
+}
+
+TensorInfo::TensorInfo(Format format)
+ : TensorInfo(TensorShape(), format)
{
}
@@ -47,10 +68,16 @@
init(tensor_shape, format);
}
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, size_t fixed_point_pos)
+TensorInfo::TensorInfo(size_t num_channels, DataType data_type, size_t fixed_point_position)
: TensorInfo()
{
- init(tensor_shape, num_channels, data_type, fixed_point_pos);
+ init(TensorShape(), num_channels, data_type, fixed_point_position);
+}
+
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
+ : TensorInfo()
+{
+ init(tensor_shape, num_channels, data_type, fixed_point_position);
}
TensorInfo::TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height)
@@ -59,6 +86,11 @@
init(hog_info, width, height);
}
+void TensorInfo::init(Format format)
+{
+ init(TensorShape(), format);
+}
+
void TensorInfo::init(const TensorShape &tensor_shape, Format format)
{
size_t num_channels = num_channels_from_format(format);
@@ -81,33 +113,34 @@
_format = format;
}
-void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, size_t fixed_point_pos)
+void TensorInfo::init(size_t num_channels, DataType data_type, size_t fixed_point_position)
{
- ARM_COMPUTE_ERROR_ON(0 == num_channels);
+ init(TensorShape(), num_channels, data_type, fixed_point_position);
+}
- _fixed_point_pos = fixed_point_pos;
- _data_type = data_type;
- _num_channels = num_channels;
- _format = Format::UNKNOWN;
- _tensor_shape = tensor_shape;
- _offset_first_element_in_bytes = 0;
- _strides_in_bytes = compute_strides(*this);
+void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
+{
+ ARM_COMPUTE_ERROR_ON(num_channels == 0);
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
- const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
- _total_size = _tensor_shape[idx_last_dimension] * _strides_in_bytes[idx_last_dimension];
+ _fixed_point_position = fixed_point_position;
+ _data_type = data_type;
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
- Coordinates coordinates;
- coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
- _valid_region = ValidRegion{ coordinates, _tensor_shape };
+ set_tensor_shape(tensor_shape);
}
void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type,
const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
- size_t total_size_in_bytes, size_t fixed_point_pos)
+ size_t total_size_in_bytes, int fixed_point_position)
{
- ARM_COMPUTE_ERROR_ON(0 == num_channels);
+ ARM_COMPUTE_ERROR_ON(num_channels == 0);
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
- _fixed_point_pos = fixed_point_pos;
+ _fixed_point_position = fixed_point_position;
_data_type = data_type;
_num_channels = num_channels;
_format = Format::UNKNOWN;
@@ -146,15 +179,17 @@
return total_size;
}
-size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, size_t fixed_point_pos)
+size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
{
- ARM_COMPUTE_ERROR_ON(0 == num_channels);
+ ARM_COMPUTE_ERROR_ON(num_channels == 0);
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
- _fixed_point_pos = fixed_point_pos;
- _data_type = data_type;
- _num_channels = num_channels;
- _format = Format::UNKNOWN;
- _tensor_shape = tensor_shape;
+ _fixed_point_position = fixed_point_position;
+ _data_type = data_type;
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
+ _tensor_shape = tensor_shape;
Coordinates coordinates;
coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
@@ -183,10 +218,11 @@
{
ARM_COMPUTE_ERROR_ON(!_is_resizable);
- /* Some kernels compute 32 elements at the time, worst case scenario they will read 32 values after the last element */
- const size_t extra_pad_x = 32;
- const size_t pad_x = 4;
- const size_t pad_y = (_tensor_shape.num_dimensions() == 1 ? 0 : 4); // Skip pad_y if the tensor has just 1 dimension
+ // Some kernels compute 32 elements at a time; in the worst case they
+ // will read 32 values past the last element
+ const size_t extra_pad_x = _tensor_shape.num_dimensions() < 1 ? 0 : 32;
+ const size_t pad_x = _tensor_shape.num_dimensions() < 1 ? 0 : 4;
+ const size_t pad_y = _tensor_shape.num_dimensions() < 2 ? 0 : 4;
return extend_padding(PaddingSize(pad_y, pad_x + extra_pad_x, pad_y, pad_x));
}
@@ -196,7 +232,7 @@
// Calculate resulting stride for the X, Y and Z dimension
const size_t stride_x = element_size();
const size_t stride_y = (padding.left + _tensor_shape[0] + padding.right) * stride_x;
- const size_t stride_z = _tensor_shape.num_dimensions() == 1 ? 0 : (padding.top + _tensor_shape[1] + padding.bottom) * stride_y;
+ const size_t stride_z = (padding.top + _tensor_shape[1] + padding.bottom) * stride_y;
Strides required_strides;
size_t required_total_size = 0;
@@ -204,9 +240,18 @@
switch(_tensor_shape.num_dimensions())
{
+ case 0:
+ {
+ if(_tensor_shape.total_size() > 0)
+ {
+ required_strides = Strides(stride_x);
+ required_total_size = stride_z;
+ }
+ break;
+ }
case 1:
required_strides = compute_strides(*this, stride_x);
- required_total_size = stride_y;
+ required_total_size = stride_z;
break;
case 2:
required_strides = compute_strides(*this, stride_x, stride_y);
@@ -261,12 +306,60 @@
return updated;
}
+void TensorInfo::set_data_type(DataType data_type)
+{
+ _data_type = data_type;
+ _format = Format::UNKNOWN;
+}
+
+void TensorInfo::set_num_channels(int num_channels)
+{
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
+}
+
void TensorInfo::set_format(Format format)
{
- ARM_COMPUTE_ERROR_ON(num_channels_from_format(format) != _num_channels);
- ARM_COMPUTE_ERROR_ON(data_type_from_format(format) != _data_type);
-
_format = format;
+
+ if(_data_type == DataType::UNKNOWN)
+ {
+ _num_channels = num_channels_from_format(format);
+ _data_type = data_type_from_format(format);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(num_channels_from_format(format) != _num_channels);
+ ARM_COMPUTE_ERROR_ON(data_type_from_format(format) != _data_type);
+ }
+}
+
+void TensorInfo::set_tensor_shape(TensorShape shape)
+{
+ _tensor_shape = shape;
+ _offset_first_element_in_bytes = 0;
+ _strides_in_bytes = compute_strides(*this);
+
+ if(_tensor_shape.num_dimensions() == 0)
+ {
+ _total_size = _strides_in_bytes[0];
+ }
+ else
+ {
+ const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
+ _total_size = _tensor_shape[idx_last_dimension] * _strides_in_bytes[idx_last_dimension];
+ }
+
+ Coordinates coordinates;
+ coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+ _valid_region = ValidRegion{ coordinates, _tensor_shape };
+}
+
+void TensorInfo::set_fixed_point_position(int fixed_point_position)
+{
+ ARM_COMPUTE_ERROR_ON(_data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(_data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
+ _fixed_point_position = fixed_point_position;
}
size_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const
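The fixed_point_position checks added throughout this file pin down the QS8/QS16 formats;
a small conversion sketch, assuming the usual interpretation of fixed_point_position as
the number of fractional bits (the bounds 1..6 and 1..14 then leave at least one integer
bit besides the sign):

    #include <cstdint>

    int8_t float_to_qs8(float value, int fixed_point_position)
    {
        return static_cast<int8_t>(value * static_cast<float>(1 << fixed_point_position)); // truncating conversion
    }

    float qs8_to_float(int8_t value, int fixed_point_position)
    {
        return static_cast<float>(value) / static_cast<float>(1 << fixed_point_position);
    }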
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index a6ece09..bf005c1 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -23,6 +23,8 @@
*/
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/FixedPoint.h"
+
#include <algorithm>
#include <cmath>
#include <cstdint>
@@ -128,8 +130,10 @@
{ DataType::UNKNOWN, "UNKNOWN" },
{ DataType::S8, "S8" },
{ DataType::U8, "U8" },
+ { DataType::QS8, "QS8" },
{ DataType::S16, "S16" },
{ DataType::U16, "U16" },
+ { DataType::QS16, "QS16" },
{ DataType::S32, "S32" },
{ DataType::U32, "U32" },
{ DataType::S64, "S64" },
@@ -210,10 +214,23 @@
return border_mode_map[border_mode];
}
-std::string arm_compute::lower_string(std::string val)
+const std::string &arm_compute::string_from_norm_type(NormType type)
{
- std::transform(val.begin(), val.end(), val.begin(), ::tolower);
- return val;
+ static std::map<NormType, const std::string> norm_type_map =
+ {
+ { NormType::IN_MAP_1D, "IN_MAP_1D" },
+ { NormType::IN_MAP_2D, "IN_MAP_2D" },
+ { NormType::CROSS_MAP, "CROSS_MAP" },
+ };
+
+ return norm_type_map[type];
+}
+
+std::string arm_compute::lower_string(const std::string &val)
+{
+ std::string res = val;
+ std::transform(res.begin(), res.end(), res.begin(), ::tolower);
+ return res;
}
const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height, unsigned int kernel_size,
@@ -259,6 +276,10 @@
case DataType::U8:
print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim);
break;
+ case DataType::QS8:
+ case DataType::S8:
+ print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
+ break;
case DataType::U16:
print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
break;
@@ -287,6 +308,9 @@
{
case DataType::U8:
return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n);
+ case DataType::QS8:
+ case DataType::S8:
+ return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
case DataType::U16:
return max_consecutive_elements_display_width_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n);
case DataType::S16:
@@ -302,4 +326,4 @@
default:
ARM_COMPUTE_ERROR("Undefined element size for given data type");
}
-}
\ No newline at end of file
+}
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index 1ab8dcc..ae2841d 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -84,7 +84,7 @@
for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
- ARM_COMPUTE_ERROR_ON_LOC_MSG(win[i].start() != 0 || win[i].end() != 1,
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(win[i].start() != 0 || win[i].end() != win[i].step(),
function, file, line,
"Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
}
@@ -172,3 +172,44 @@
function, file, line,
"This kernel hasn't been configured.");
}
+
+void arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
+ const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(parent_shape);
+ ARM_COMPUTE_UNUSED(coords);
+ ARM_COMPUTE_UNUSED(shape);
+
+ // Subtensor should not index in x, y dimensions.
+ ARM_COMPUTE_ERROR_ON_LOC(((coords.x() != 0) || (coords.y() != 0)), function, file, line);
+ // Subtensor shape should match parent tensor in x, y dimensions.
+ ARM_COMPUTE_ERROR_ON_LOC(((parent_shape.x() != shape.x()) || (parent_shape.y() != shape.y())), function, file, line);
+
+ // Check dimensions
+ for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(((coords[i] >= static_cast<int>(parent_shape[i])) || (coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]))),
+ function, file, line);
+ }
+}
+
+void arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
+ const ValidRegion &parent_valid_region, const ValidRegion &valid_region)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(parent_valid_region);
+ ARM_COMPUTE_UNUSED(valid_region);
+
+ // Check valid regions
+ for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
+ function, file, line);
+ }
+}
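A small usage sketch (with assumed shapes) of the bounds rule enforced by
error_on_invalid_subtensor: a sub-tensor must start inside the parent and end within it
along every dimension.

    #include <cstddef>

    // True when [start, start + extent) fits inside a parent dimension of size parent_dim.
    bool fits_in_parent_dim(int start, std::size_t extent, std::size_t parent_dim)
    {
        return start >= 0 && static_cast<std::size_t>(start) < parent_dim
               && static_cast<std::size_t>(start) + extent <= parent_dim;
    }
    // e.g. fits_in_parent_dim(4, 4, 16) -> true, fits_in_parent_dim(14, 4, 16) -> false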
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
new file mode 100644
index 0000000..3f5266c
--- /dev/null
+++ b/src/runtime/CL/CLHOG.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLHOG.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOG::CLHOG()
+ : _info(), _buffer()
+{
+}
+
+void CLHOG::init(const HOGInfo &input)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+ _info = input;
+ _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info()->descriptor_size() * sizeof(float));
+}
+
+void CLHOG::free()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+
+ _buffer = cl::Buffer();
+}
+
+const HOGInfo *CLHOG::info() const
+{
+ return &_info;
+}
+
+const cl::Buffer &CLHOG::cl_buffer() const
+{
+ return _buffer;
+}
+
+void CLHOG::map(bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(descriptor() != nullptr);
+ ICLHOG::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLHOG::unmap()
+{
+ ARM_COMPUTE_ERROR_ON(descriptor() == nullptr);
+ ICLHOG::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size()));
+}
+
+void CLHOG::do_unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ q.enqueueUnmapMemObject(_buffer, descriptor());
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
new file mode 100644
index 0000000..b9e8739
--- /dev/null
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMultiHOG.h"
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+CLMultiHOG::CLMultiHOG(size_t num_models)
+ : _num_models(num_models), _model(arm_compute::cpp14::make_unique<CLHOG[]>(_num_models))
+{
+}
+
+size_t CLMultiHOG::num_models() const
+{
+ return _num_models;
+}
+
+ICLHOG *CLMultiHOG::cl_model(size_t index)
+{
+ ARM_COMPUTE_ERROR_ON(index >= _num_models);
+ return (_model.get() + index);
+}
+
+const ICLHOG *CLMultiHOG::cl_model(size_t index) const
+{
+ ARM_COMPUTE_ERROR_ON(index >= _num_models);
+ return (_model.get() + index);
+}
\ No newline at end of file
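
A usage sketch for the CLMultiHOG container (illustrative only): it holds num_models CLHOG objects, each of which still has to be initialised with its own HOGInfo. The static_cast below relies on the container storing CLHOG instances, as shown above; the HOGInfo argument order is an assumption.

#include "arm_compute/core/HOGInfo.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/runtime/CL/CLHOG.h"
#include "arm_compute/runtime/CL/CLMultiHOG.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;

void example_clmultihog()
{
    CLScheduler::get().default_init();

    // Two models, e.g. two detection window sizes (assumed HOGInfo argument order)
    const HOGInfo info0(Size2D(8U, 8U), Size2D(16U, 16U), Size2D(64U, 128U), Size2D(8U, 8U), 9U);
    const HOGInfo info1(Size2D(8U, 8U), Size2D(16U, 16U), Size2D(128U, 256U), Size2D(8U, 8U), 9U);

    CLMultiHOG multi_hog(2);
    static_cast<CLHOG *>(multi_hog.cl_model(0))->init(info0);
    static_cast<CLHOG *>(multi_hog.cl_model(1))->init(info1);

    // cl_model(i) can now be handed to CLHOGDetector / CLHOGMultiDetection
}
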
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 1f3dbbe..fe25ce5 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -28,7 +28,7 @@
using namespace arm_compute;
CLScheduler::CLScheduler()
- : _context(), _queue()
+ : _context(), _queue(), _target(GPUTarget::MIDGARD)
{
}
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
new file mode 100644
index 0000000..b228c0a
--- /dev/null
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+ : _parent(nullptr), _info()
+{
+ ARM_COMPUTE_ERROR_ON(parent == nullptr);
+ _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+ _parent = parent;
+}
+
+ITensorInfo *CLSubTensor::info() const
+{
+ return &_info;
+}
+
+ITensorInfo *CLSubTensor::info()
+{
+ return &_info;
+}
+
+const cl::Buffer &CLSubTensor::cl_buffer() const
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->cl_buffer();
+}
+
+ICLTensor *CLSubTensor::parent()
+{
+ return _parent;
+}
+
+void CLSubTensor::map(bool blocking)
+{
+ ICLTensor::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLSubTensor::unmap()
+{
+ ICLTensor::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(cl_buffer(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->total_size()));
+}
+
+void CLSubTensor::do_unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
+ q.enqueueUnmapMemObject(cl_buffer(), buffer());
+}
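
CLSubTensor is a shaped view into a parent ICLTensor: it shares the parent's cl::Buffer (cl_buffer() forwards to the parent) and allocates nothing of its own. A minimal sketch, assuming CLScheduler::get().default_init() is available to set up the default context and queue:

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLSubTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void example_clsubtensor()
{
    CLScheduler::get().default_init();

    // Parent tensor: 8 feature maps of 16x16
    CLTensor parent;
    parent.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    parent.allocator()->allocate();

    // View on the first 4 feature maps, anchored at the origin of the parent
    CLSubTensor sub(&parent, TensorShape(16U, 16U, 4U), Coordinates(0, 0, 0));

    // The view can be passed to any function taking an ICLTensor*
    sub.map(true);
    // ... read or write through sub.buffer() on the host ...
    sub.unmap();
}
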
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
new file mode 100644
index 0000000..3df673c
--- /dev/null
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLBatchNormalizationLayer::CLBatchNormalizationLayer()
+ : _norm_kernel()
+{
+}
+
+void CLBatchNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
+{
+ _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+}
+
+void CLBatchNormalizationLayer::run()
+{
+ CLScheduler::get().enqueue(_norm_kernel, true);
+}
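
The new function is a thin wrapper around a single CLBatchNormalizationLayerKernel. A hedged usage sketch (shapes are illustrative, not from the patch; mean/var/beta/gamma hold one value per feature map, i.e. per z-slice of the input):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"

#include <initializer_list>

using namespace arm_compute;

void example_clbatchnormalizationlayer()
{
    CLScheduler::get().default_init();

    CLTensor src, dst, mean, var, beta, gamma;
    src.allocator()->init(TensorInfo(TensorShape(28U, 28U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(28U, 28U, 16U), 1, DataType::F32));

    // One value per feature map
    const TensorInfo per_channel(TensorShape(16U), 1, DataType::F32);
    mean.allocator()->init(per_channel);
    var.allocator()->init(per_channel);
    beta.allocator()->init(per_channel);
    gamma.allocator()->init(per_channel);

    CLBatchNormalizationLayer bn;
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 0.001f);

    for(CLTensor *t : { &src, &dst, &mean, &var, &beta, &gamma })
    {
        t->allocator()->allocate();
    }

    bn.run(); // enqueues the kernel and flushes the queue
}
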
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index bb47bf9..f0bbc35 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -33,83 +33,155 @@
using namespace arm_compute;
-CLConvolutionLayer::CLConvolutionLayer()
- : _input_im2col_kernel(), _weights_reshape_kernel(), _input_interleave_kernel(), _weights_transposed_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
- _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _is_first_run(false), _has_bias(false), _is_fc(false)
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
}
-void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const bool _has_bias = (biases != nullptr);
+
+ _transpose1xW = transpose1xW;
+
+ if(transpose1xW)
+ {
+ // Create tensor to store the reshaped weights
+ const unsigned int mat_weights_cols = weights->info()->dimension(3);
+ const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ const DataType dt = weights->info()->data_type();
+ TensorInfo info_wr(shape_wr, 1, dt);
+
+ _weights_reshaped.allocator()->init(info_wr);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _weights_transposed_kernel.configure(&_weights_reshaped, output);
+ _weights_reshaped.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshape_kernel.configure(weights, biases, output);
+ }
+}
+
+void CLConvolutionLayerReshapeWeights::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ if(_transpose1xW)
+ {
+ CLScheduler::get().enqueue(_weights_transposed_kernel);
+ }
+}
+
+CLConvolutionLayer::CLConvolutionLayer()
+ : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
+ _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+{
+}
+
+void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
}
- _has_bias = (biases != nullptr);
- _is_first_run = true;
+ _has_bias = (biases != nullptr);
+ _are_weights_reshaped = weights_info.are_reshaped();
// Get parameters for conv_info
- unsigned int stride_x, stride_y, pad_x, pad_y = 0;
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
std::tie(stride_x, stride_y) = conv_info.stride();
std::tie(pad_x, pad_y) = conv_info.pad();
- bool is_same_dimension = true;
- // Make sure the input and weights have same low three dimensions
- for(int i = 0; i < 3; i++)
- {
- is_same_dimension = (is_same_dimension) && (input->info()->dimension(i) == weights->info()->dimension(i));
- }
-
- // Run the fully connected path if is_same_dimension is true and conv_stride_x/conv_stride_y are 1, and conv_pad_x/conv_pad_y are 0 and skip col2im
- _is_fc = (is_same_dimension) && ((stride_x & stride_y) == 1) && ((pad_x | pad_y) == 0);
-
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+
+ const unsigned int kernel_width = _are_weights_reshaped ? weights_info.kernel_size() : weights->info()->dimension(0);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
stride_x, stride_y, pad_x, pad_y, conv_info.round());
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+    // Check if it's a "fully connected" convolution
+ _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+
// Create tensor to store the reshaped weights
- const size_t mat_weights_cols = weights->info()->dimension(3);
- const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
- const TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
- // Create tensor to store transposed weights
- TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
- TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
- _weights_transposed.allocator()->init(info_wt);
-
+ size_t mat_weights_cols = weights->info()->dimension(3);
+ size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = output->info()->dimension(2);
+ const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+ mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+ }
+ else
+ {
+ if(_is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, weights->info()->data_type());
+ _weights_reshaped.allocator()->init(info_wr);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false);
+ weights = &_weights_reshaped;
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
+ TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
+ _weights_transposed.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_transposed, true);
+ weights = &_weights_transposed;
+ }
+ }
// Create tensor to store im2col reshaped inputs
const size_t mat_input_cols = mat_weights_rows;
- const size_t mat_input_rows = _is_fc ? (input->info()->dimension(3)) : (conv_w * conv_h);
+ const size_t mat_input_rows = conv_w * conv_h;
TensorShape shape_im2col = input->info()->tensor_shape();
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- if(_is_fc)
- {
- shape_im2col.set(3, 1);
- }
_input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
- // Create tensor to prepare input tensor for GEMM
- TensorShape shape_interleaved = shape_im2col;
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+ // Create tensor (interleave) to prepare input tensor for GEMM
+ if(!_is_fully_connected_convolution)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+ }
// Create GEMM output tensor
TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
@@ -119,48 +191,57 @@
// Configure kernels
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
- _weights_transposed_kernel.configure(&_weights_reshaped, &_weights_transposed);
- if(_is_fc)
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ if(_is_fully_connected_convolution)
{
- _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, output, 1.0f);
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
}
else
{
- _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, &_gemm_output, 1.0f);
- _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
}
- // Allocate intermediate tensors
- _weights_reshaped.allocator()->allocate();
- _weights_transposed.allocator()->allocate();
+ if(!_are_weights_reshaped)
+ {
+ if(!_is_fully_connected_convolution)
+ {
+ _weights_transposed.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
+ }
+
_input_im2col_reshaped.allocator()->allocate();
- _input_interleaved_reshaped.allocator()->allocate();
+ if(!_is_fully_connected_convolution)
+ {
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
_gemm_output.allocator()->allocate();
}
void CLConvolutionLayer::run()
{
// Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
+ if(!_are_weights_reshaped)
{
- _is_first_run = false;
- CLScheduler::get().enqueue(_weights_reshape_kernel);
- CLScheduler::get().enqueue(_weights_transposed_kernel);
+ _are_weights_reshaped = true;
+ _reshape_weights.run();
}
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
- CLScheduler::get().enqueue(_input_interleave_kernel);
+ if(!_is_fully_connected_convolution)
+ {
+ CLScheduler::get().enqueue(_input_interleave_kernel);
+ }
// Runs matrix multiply on reshaped matrices
CLScheduler::get().enqueue(_mm_kernel);
// Reshape output matrix
-
- if(!_is_fc)
- {
- CLScheduler::get().enqueue(_output_col2im_kernel, false);
- }
+ CLScheduler::get().enqueue(_output_col2im_kernel, false);
}
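
With this change the weight reshaping is factored out into CLConvolutionLayerReshapeWeights and executed once, lazily, on the first run(); callers that already hold reshaped weights can signal it through the new WeightsInfo argument. A hedged sketch of the default path, where a default-constructed WeightsInfo() lets the function reshape the weights itself (shapes are illustrative, not from the patch; CLScheduler::get().default_init() is assumed):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

using namespace arm_compute;

void example_clconvolutionlayer()
{
    CLScheduler::get().default_init();

    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32)); // 16 kernels of 3x3x3
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

    CLConvolutionLayer conv;
    // Stride 1x1, padding 1x1; a default-constructed WeightsInfo means "not reshaped yet"
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), WeightsInfo());

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run(); // the first run() also executes CLConvolutionLayerReshapeWeights
}
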
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
new file mode 100644
index 0000000..d967d98
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDepthConcatenate::CLDepthConcatenate()
+ : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+{
+}
+
+void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+ _num_inputs = inputs_vector.size();
+
+ unsigned int depth_offset = 0;
+
+ _concat_kernels_vector = arm_compute::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ depth_offset += inputs_vector.at(i)->info()->dimension(2);
+ }
+}
+
+void CLDepthConcatenate::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ for(unsigned i = 0; i < _num_inputs; i++)
+ {
+ CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+ CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+ }
+}
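
CLDepthConcatenate stacks its inputs along the depth (z) axis of the output. A minimal sketch with two inputs of the same spatial size (shapes illustrative; CLScheduler::get().default_init() assumed):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"

#include <vector>

using namespace arm_compute;

void example_cldepthconcatenate()
{
    CLScheduler::get().default_init();

    CLTensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(16U, 16U, 4U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(16U, 16U, 10U), 1, DataType::F32)); // 4 + 6 feature maps

    std::vector<ICLTensor *> inputs = { &in0, &in1 };

    CLDepthConcatenate concat;
    concat.configure(inputs, &out);

    in0.allocator()->allocate();
    in1.allocator()->allocate();
    out.allocator()->allocate();

    concat.run();
}
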
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 08e18df..57d57d5 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -31,35 +31,99 @@
using namespace arm_compute;
+CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
+ : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+{
+}
+
+void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
+ ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _transpose_weights = transpose_weights;
+ _is_batched_fc_layer = is_batched_fc_layer;
+
+ // Check if we need to transpose the weights
+ if(_transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Initialize the output tensor for transpose
+ TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
+ _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+ _transpose_kernel.configure(input, &_transpose_output);
+
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(&_transpose_output, output);
+
+ // Allocate temporary tensor used for transposing the weights
+ _transpose_output.allocator()->allocate();
+ }
+ else
+ {
+ _transpose_kernel.configure(input, output);
+ }
+ }
+ else
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+ }
+ }
+}
+
+void CLFullyConnectedLayerReshapeWeights::run()
+{
+ if(_transpose_weights)
+ {
+ CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
+ }
+ if(_is_batched_fc_layer)
+ {
+ CLScheduler::get().enqueue(_transpose1xW_kernel);
+ }
+}
+
CLFullyConnectedLayer::CLFullyConnectedLayer()
- : _im2col_kernel(), _transpose_kernel(), _transpose1xW_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _transpose_output(),
- _transpose1xW_output(), _is_first_run(true), _transpose_weights(true), _fc_after_conv(true), _batched_fc_layer(false), _accumulate_biases(false)
+ : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
+ _are_weights_reshaped(true), _is_fc_after_conv(true), _is_batched_fc_layer(false), _accumulate_biases(false)
{
}
void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
TensorShape shape_im2col;
- shape_im2col.set(0, weights->info()->dimension(1));
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
shape_im2col.set(1, input->info()->dimension(3));
shape_im2col.set(2, input->info()->dimension(4));
shape_im2col.set(3, input->info()->dimension(5));
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
// Initialize output tensor for interleave 4x4
TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
- // Initialize output tensor for transpose 1xW
- TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
- _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
// Configure im2col kernel
_im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -67,55 +131,49 @@
// Configure interleave4x4 kernel
_interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
// Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
// Allocate the tensors once all the configure methods have been called
_im2col_output.allocator()->allocate();
_interleave4x4_output.allocator()->allocate();
- _transpose1xW_output.allocator()->allocate();
}
void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
// Initialize output tensor for interleave 4x4
TensorShape shape_interleaved = input->info()->tensor_shape();
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
- // Initialize output tensor for transpose 1xW
- TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
- _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
// Configure interleave4x4 kernel
_interleave4x4_kernel.configure(input, &_interleave4x4_output);
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
// Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
// Allocate the tensors once all the configure methods have been called
_interleave4x4_output.allocator()->allocate();
- _transpose1xW_output.allocator()->allocate();
}
void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
TensorShape shape_im2col;
- shape_im2col.set(0, weights->info()->dimension(1));
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
shape_im2col.set(1, 1);
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
// Configure im2col kernel
_im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -135,20 +193,20 @@
_mm_kernel.configure(input, weights, output, 1.0f);
}
-void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights)
+void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
- const ICLTensor *weights_to_use = weights;
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
- _is_first_run = true;
- _transpose_weights = transpose_weights;
- _fc_after_conv = true;
- _batched_fc_layer = false;
- _accumulate_biases = false;
+ _are_weights_reshaped = are_weights_reshaped;
+ _is_fc_after_conv = true;
+ _is_batched_fc_layer = false;
+ _accumulate_biases = false;
if(biases != nullptr)
{
@@ -160,17 +218,6 @@
_accumulate_biases_kernel.configure(output, biases);
}
- // Check if we need to transpose the weights
- if(_transpose_weights)
- {
- // Initialize the output tensor for transpose
- TensorShape shape_transposed(weights->info()->dimension(1), weights->info()->dimension(0));
- _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, weights->info()->data_type()));
- _transpose_kernel.configure(weights, &_transpose_output);
-
- weights_to_use = &_transpose_output;
- }
-
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
// 2) Fully Connected layer -> Fully Connected layer without batches
@@ -178,15 +225,54 @@
// 4) Fully Connected layer -> Fully Connected layer with batches
// Check if we have a fully connected layer with batches
- _batched_fc_layer = (output->info()->dimension(1) > 1);
+ _is_batched_fc_layer = (output->info()->dimension(1) > 1);
- if(_batched_fc_layer)
+ const ICLTensor *weights_to_use = weights;
+
+ if(!are_weights_reshaped)
{
- _fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ if((transpose_weights || _is_batched_fc_layer))
+ {
+ weights_to_use = &_reshape_weights_output;
- if(_fc_after_conv)
+ if(transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ else
+ {
+ TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
+
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ }
+ }
+
+ if(_is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+
+ if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer with batches
configure_conv_fc_wb(input, weights_to_use, output);
@@ -199,9 +285,10 @@
}
else
{
- _fc_after_conv = (weights_to_use->info()->dimension(1) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+        // If the fully connected layer is not batched, the weights are not reshaped with transpose1xW
+ _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
- if(_fc_after_conv)
+ if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
configure_conv_fc_nb(input, weights_to_use, output);
@@ -213,39 +300,34 @@
}
}
- // Allocate the transpose tensor if the transpose_weights flag is true and once all the configure methods have been called
- if(_transpose_weights)
+ // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
+ if(!are_weights_reshaped)
{
- _transpose_output.allocator()->allocate();
+ if(transpose_weights || _is_batched_fc_layer)
+ {
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
+ }
}
}
void CLFullyConnectedLayer::run()
{
- // The reshape of the weights happens only once
- if(_is_first_run)
+ // Reshape of the weights (happens only once)
+ if(!_are_weights_reshaped)
{
- _is_first_run = false;
-
- if(_transpose_weights)
- {
- CLScheduler::get().enqueue(_transpose_kernel);
- }
-
- if(_batched_fc_layer)
- {
- CLScheduler::get().enqueue(_transpose1xW_kernel);
- }
+ _are_weights_reshaped = true;
+ _reshape_weights_kernel.run();
}
// Linearize input if it comes from a convolutional layer
- if(_fc_after_conv)
+ if(_is_fc_after_conv)
{
CLScheduler::get().enqueue(_im2col_kernel, false);
}
// Interleave input
- if(_batched_fc_layer)
+ if(_is_batched_fc_layer)
{
CLScheduler::get().enqueue(_interleave4x4_kernel, false);
}
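
The weight reshaping for the fully connected layer is now grouped in CLFullyConnectedLayerReshapeWeights (transpose and/or transpose1xW depending on the batched case) and executed once, lazily, on the first run(). A hedged sketch of the simplest case, a non-batched layer whose weights are stored as (number of inputs, number of outputs) and still need transposing (shapes illustrative; CLScheduler::get().default_init() assumed):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

using namespace arm_compute;

void example_clfullyconnectedlayer()
{
    CLScheduler::get().default_init();

    // 128 inputs -> 10 outputs, no batches
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 10U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

    CLFullyConnectedLayer fc;
    // transpose_weights = true, are_weights_reshaped = false
    fc.configure(&src, &weights, &biases, &dst, true, false);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    fc.run(); // the first run() also executes CLFullyConnectedLayerReshapeWeights
}
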
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
new file mode 100644
index 0000000..b1b5a03
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOGDescriptor::CLHOGDescriptor()
+ : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+{
+}
+
+void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == hog);
+
+ const HOGInfo *hog_info = hog->info();
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const size_t num_bins = hog_info->num_bins();
+
+ Size2D cell_size = hog_info->cell_size();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell_size.width;
+ const size_t num_cells_y = height / cell_size.height;
+
+ // TensorShape of the input image
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // TensorShape of the hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+    // Initialize tensors for magnitude, phase and hog space
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space.allocator()->init(info_space);
+
+ // Initialise gradient kernel
+ _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+
+ // Initialise orientation binning kernel
+ _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+
+ // Initialize HOG norm kernel
+ _block_norm.configure(&_hog_space, output, hog->info());
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _hog_space.allocator()->allocate();
+}
+
+void CLHOGDescriptor::run()
+{
+ // Run gradient
+ _gradient.run();
+
+ // Run orientation binning
+ CLScheduler::get().enqueue(_orient_bin, false);
+
+ // Run block normalization
+ CLScheduler::get().enqueue(_block_norm);
+}
\ No newline at end of file
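
CLHOGDescriptor chains gradient computation, orientation binning and block normalization to produce the HOG descriptor of a whole image. A hedged sketch; it assumes the HOGInfo constructor argument order shown and relies on the TensorInfo(HOGInfo, width, height) constructor used elsewhere in this patch to size the output:

#include "arm_compute/core/HOGInfo.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLHOG.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"

using namespace arm_compute;

void example_clhogdescriptor()
{
    CLScheduler::get().default_init();

    // Assumed argument order: cell, block, detection window, block stride, bins
    const HOGInfo hog_info(Size2D(8U, 8U), Size2D(16U, 16U), Size2D(64U, 128U), Size2D(8U, 8U), 9U);

    CLHOG hog;
    hog.init(hog_info);

    CLTensor image, descriptor;
    image.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
    descriptor.allocator()->init(TensorInfo(hog_info, 640U, 480U)); // block-normalized HOG space

    CLHOGDescriptor hog_descriptor;
    hog_descriptor.configure(&image, &descriptor, &hog, BorderMode::CONSTANT, 0);

    image.allocator()->allocate();
    descriptor.allocator()->allocate();

    hog_descriptor.run();
}
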
diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
new file mode 100644
index 0000000..8eb5e42
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGDetector.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+CLHOGDetector::CLHOGDetector()
+ : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows()
+{
+}
+
+void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
+{
+ _detection_windows = detection_windows;
+
+ // Allocate buffer for storing the number of detected objects
+ _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
+
+ // Configure HOGDetectorKernel
+ _hog_detector_kernel.configure(input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
+}
+
+void CLHOGDetector::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // Reset number of detections
+ const unsigned int init_num_detection_windows = _detection_windows->num_values();
+ q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows);
+
+ // Run CLHOGDetectorKernel
+ CLScheduler::get().enqueue(_hog_detector_kernel);
+
+ // Read number of detections
+ unsigned int num_detection_windows = 0;
+ q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows);
+
+ // Update the number of values stored in _detection_windows
+ _detection_windows->resize(static_cast<size_t>(num_detection_windows));
+
+ q.flush();
+}
\ No newline at end of file
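
CLHOGDetector runs a sliding-window evaluation of a trained ICLHOG model over a block-normalized HOG space and appends the matches to an ICLDetectionWindowArray; the number of detections is kept in a small OpenCL buffer and read back after the kernel. A hedged sketch (the HOGInfo argument order and the CLDetectionWindowArray capacity constructor are assumptions):

#include "arm_compute/core/HOGInfo.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLHOG.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"

using namespace arm_compute;

void example_clhogdetector()
{
    CLScheduler::get().default_init();

    // Assumed argument order: cell, block, detection window, block stride, bins
    const HOGInfo hog_info(Size2D(8U, 8U), Size2D(16U, 16U), Size2D(64U, 128U), Size2D(8U, 8U), 9U);

    CLHOG hog;
    hog.init(hog_info); // the SVM coefficients would be written through map()/descriptor()/unmap()

    // Block-normalized HOG space of the image being scanned, e.g. produced by CLHOGDescriptor
    CLTensor hog_space;
    hog_space.allocator()->init(TensorInfo(hog_info, 640U, 480U));

    CLDetectionWindowArray detection_windows(1000); // room for up to 1000 matches

    CLHOGDetector detector;
    detector.configure(&hog_space, &hog, &detection_windows, hog_info.block_stride(), 0.f, 0);

    hog_space.allocator()->allocate();
    detector.run();

    detection_windows.map(true);
    // detection_windows.num_values() matches are now readable on the host
    detection_windows.unmap();
}
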
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
new file mode 100644
index 0000000..2387474
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOGGradient::CLHOGGradient()
+ : _derivative(), _mag_phase(), _gx(), _gy()
+{
+}
+
+void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8);
+
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // Allocate image memory
+ TensorInfo info(shape_img, Format::S16);
+ _gx.allocator()->init(info);
+ _gy.allocator()->init(info);
+
+    // Initialise derivative kernel
+ _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
+
+ // Initialise magnitude/phase kernel
+ if(PhaseType::UNSIGNED == phase_type)
+ {
+ _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
+ }
+ else
+ {
+ _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
+ }
+
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+}
+
+void CLHOGGradient::run()
+{
+ // Run derivative
+ _derivative.run();
+
+ // Run magnitude/phase kernel
+ CLScheduler::get().enqueue(_mag_phase);
+}
\ No newline at end of file
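
CLHOGGradient is the first stage of the HOG pipeline: a derivative kernel followed by a magnitude/phase kernel, with the phase computed as signed or unsigned depending on the PhaseType. A minimal sketch (shapes illustrative; CLScheduler::get().default_init() assumed):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"

using namespace arm_compute;

void example_clhoggradient()
{
    CLScheduler::get().default_init();

    CLTensor image, magnitude, phase;
    image.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
    magnitude.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::S16));
    phase.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

    CLHOGGradient gradient;
    gradient.configure(&image, &magnitude, &phase, PhaseType::UNSIGNED, BorderMode::REPLICATE, 0);

    image.allocator()->allocate();
    magnitude.allocator()->allocate();
    phase.allocator()->allocate();

    gradient.run();
}
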
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
new file mode 100644
index 0000000..b8f2224
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+using namespace arm_compute;
+
+CLHOGMultiDetection::CLHOGMultiDetection()
+ : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
+ _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+{
+}
+
+void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
+ uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
+ ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
+ ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
+
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const TensorShape &shape_img = input->info()->tensor_shape();
+ const size_t num_models = multi_hog->num_models();
+ PhaseType phase_type = multi_hog->model(0)->info()->phase_type();
+
+ size_t prev_num_bins = multi_hog->model(0)->info()->num_bins();
+ Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size();
+ Size2D prev_block_size = multi_hog->model(0)->info()->block_size();
+ Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
+
+    /* Check if the CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
+     *
+     * 1) CLHOGOrientationBinningKernel is skipped if the cell size and the number of bins do not change.
+     *        Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1"
+     * 2) CLHOGBlockNormalizationKernel is skipped if, in addition, the block size and the block stride do not change.
+     *        Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1"
+     *
+     * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel
+     *       with "input_orient_bin", "input_hog_detect" and "input_block_norm"
+     */
+ std::vector<size_t> input_orient_bin;
+ std::vector<size_t> input_hog_detect;
+ std::vector<std::pair<size_t, size_t>> input_block_norm;
+
+ input_orient_bin.push_back(0);
+ input_hog_detect.push_back(0);
+ input_block_norm.emplace_back(0, 0);
+
+ for(size_t i = 1; i < num_models; ++i)
+ {
+ size_t cur_num_bins = multi_hog->model(i)->info()->num_bins();
+ Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size();
+ Size2D cur_block_size = multi_hog->model(i)->info()->block_size();
+ Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride();
+
+ if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
+ {
+ prev_num_bins = cur_num_bins;
+ prev_cell_size = cur_cell_size;
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute orientation binning and block normalization kernels. Update input to process
+ input_orient_bin.push_back(i);
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+ else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
+ || (cur_block_stride.height != prev_block_stride.height))
+ {
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute block normalization kernel. Update input to process
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+
+ // Update input to process for hog detector kernel
+ input_hog_detect.push_back(input_block_norm.size() - 1);
+ }
+
+ _detection_windows = detection_windows;
+ _non_maxima_suppression = non_maxima_suppression;
+ _num_orient_bin_kernel = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute
+ _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
+ _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
+
+ _orient_bin_kernel = arm_compute::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+
+ // Allocate tensors for magnitude and phase
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ // Initialise gradient kernel
+ _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
+
+    // Configure CLTensor for the HOG space and orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_orient_bin[i];
+
+ // Get the corresponding cell size and number of bins
+ const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size();
+ const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell.width;
+ const size_t num_cells_y = height / cell.height;
+
+ // TensorShape of hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Allocate HOG space
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space[i].allocator()->init(info_space);
+
+ // Initialise orientation binning kernel
+ _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ // Configure CLTensor for the normalized HOG space and block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_block_norm[i].first;
+ const size_t idx_orient_bin = input_block_norm[i].second;
+
+ // Allocate normalized HOG space
+ TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
+ _hog_norm_space[i].allocator()->init(tensor_info);
+
+ // Initialize block normalization kernel
+ _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ detection_window_strides->map(CLScheduler::get().queue(), true);
+
+ // Configure HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ const size_t idx_block_norm = input_hog_detect[i];
+
+ _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ }
+
+ detection_window_strides->unmap(CLScheduler::get().queue());
+
+ // Configure non maxima suppression kernel
+ _non_maxima_kernel->configure(_detection_windows, min_distance);
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ _hog_norm_space[i].allocator()->allocate();
+ }
+}
+
+void CLHOGMultiDetection::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+
+ // Reset detection window
+ _detection_windows->clear();
+
+ // Run gradient
+ _gradient_kernel.run();
+
+ // Run orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ CLScheduler::get().enqueue(*(_orient_bin_kernel.get() + i), false);
+ }
+
+ // Run block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ CLScheduler::get().enqueue(*(_block_norm_kernel.get() + i), false);
+ }
+
+ // Run HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ _hog_detect_kernel[i].run();
+ }
+
+ // Run non-maxima suppression kernel if enabled
+ if(_non_maxima_suppression)
+ {
+ // Map detection windows array before computing non maxima suppression
+ _detection_windows->map(CLScheduler::get().queue(), true);
+ _non_maxima_kernel->run(_non_maxima_kernel->window());
+ _detection_windows->unmap(CLScheduler::get().queue());
+ }
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 6501da3..2db277f 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -34,8 +34,8 @@
#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
+#include "arm_compute/runtime/Scheduler.h"
#include <cmath>
#include <utility>
@@ -148,7 +148,7 @@
// Run corner candidate kernel
_nonmax.map(true);
- CPPScheduler::get().multithread(&_candidates);
+ Scheduler::get().schedule(&_candidates, Window::DimY);
_nonmax.unmap();
_corners->map(CLScheduler::get().queue(), true);
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
new file mode 100644
index 0000000..263fb51
--- /dev/null
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLLocallyConnectedLayer::CLLocallyConnectedLayer()
+ : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+{
+}
+
+void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
+ }
+
+ bool _has_bias = (biases != nullptr);
+ _is_first_run = true;
+
+ // Get parameters for conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ std::tie(pad_x, pad_y) = conv_info.pad();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+ stride_x, stride_y, pad_x, pad_y, conv_info.round());
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+ // Create tensor to store the reshaped weights
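+ // The extra row in mat_weights_rows reserves space for the bias values when biases are provided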
+ const size_t mat_weights_cols = weights->info()->dimension(3);
+ const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ const size_t mat_weights_num = weights->info()->dimension(4);
+
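+ // Unlike a standard convolution, each output location has its own set of weights, hence the third dimension (checked above to equal conv_w * conv_h)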
+ const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+ _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+
+ // Create tensor to store im2col reshaped inputs
+ const size_t mat_input_cols = mat_weights_rows;
+ const size_t mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+ // Create locally connected layer output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+ // Configure kernels
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ // Allocate intermediate tensors
+ _weights_reshaped.allocator()->allocate();
+ _input_im2col_reshaped.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+}
+
+void CLLocallyConnectedLayer::run()
+{
+ // Run weights reshaping (Runs once for every configure)
+ if(_is_first_run)
+ {
+ _is_first_run = false;
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ }
+
+ // Run input reshaping
+ CLScheduler::get().enqueue(_input_im2col_kernel);
+
+ // Runs vector matrix multiply on reshaped matrices
+ CLScheduler::get().enqueue(_mm_kernel);
+
+ // Reshape output matrix
+ CLScheduler::get().enqueue(_output_col2im_kernel, false);
+}
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 210dbb7..8869330 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -35,14 +35,6 @@
using namespace arm_compute;
-#ifdef NO_MULTI_THREADING
-namespace
-{
-void delete_threads(Thread *t)
-{
-}
-}
-#else /* NO_MULTI_THREADING */
class arm_compute::Thread
{
public:
@@ -162,7 +154,6 @@
delete[] t;
}
} // namespace
-#endif /* NO_MULTI_THREADING */
CPPScheduler &CPPScheduler::get()
{
@@ -170,49 +161,39 @@
return scheduler;
}
+unsigned int CPPScheduler::num_threads() const
+{
+ return _num_threads;
+}
+
CPPScheduler::CPPScheduler()
- : _num_threads(0), _threads(nullptr, delete_threads)
+ : _num_threads(std::thread::hardware_concurrency()),
+ _threads(std::unique_ptr<Thread[], void(*)(Thread *)>(new Thread[std::thread::hardware_concurrency() - 1], delete_threads))
{
- force_number_of_threads(0);
}
-void CPPScheduler::force_number_of_threads(int num_threads)
+void CPPScheduler::set_num_threads(unsigned int num_threads)
{
-#ifdef NO_MULTI_THREADING
- ARM_COMPUTE_ERROR_ON(num_threads > 1);
- _num_threads = 1;
-#else /* NO_MULTI_THREADING */
- _num_threads = num_threads > 0 ? num_threads : std::thread::hardware_concurrency();
- ARM_COMPUTE_ERROR_ON(_num_threads < 1);
-
- if(_num_threads > 1)
- {
- _threads = std::unique_ptr<Thread[], void (*)(Thread *)>(new Thread[_num_threads - 1], delete_threads);
- }
- else
- {
- _threads = nullptr;
- }
-#endif /* NO_MULTI_THREADING */
+ const unsigned int num_cores = std::thread::hardware_concurrency();
+ _num_threads = num_threads == 0 ? num_cores : num_threads;
}
-void CPPScheduler::multithread(ICPPKernel *kernel, const size_t split_dimension)
+void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
{
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
/** [Scheduler example] */
- const Window &max_window = kernel->window();
- const int num_iterations = max_window.num_iterations(split_dimension);
- int num_threads = std::min(num_iterations, _num_threads);
+ const Window &max_window = kernel->window();
+ const unsigned int num_iterations = max_window.num_iterations(split_dimension);
+ const unsigned int num_threads = std::min(num_iterations, _num_threads);
if(!kernel->is_parallelisable() || 1 == num_threads)
{
kernel->run(max_window);
}
-#ifndef NO_MULTI_THREADING
else
{
- for(int t = 0; t < num_threads; ++t)
+ for(unsigned int t = 0; t < num_threads; ++t)
{
Window win = max_window.split_window(split_dimension, t, num_threads);
win.set_thread_id(t);
@@ -230,7 +211,7 @@
try
{
- for(int t = 1; t < num_threads; ++t)
+ for(unsigned int t = 1; t < num_threads; ++t)
{
_threads[t - 1].wait();
}
@@ -240,6 +221,5 @@
std::cout << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
}
}
-#endif /* NO_MULTI_THREADING */
/** [Scheduler example] */
}
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
new file mode 100644
index 0000000..f086813
--- /dev/null
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/SingleThreadScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+SingleThreadScheduler &SingleThreadScheduler::get()
+{
+ static SingleThreadScheduler scheduler;
+ return scheduler;
+}
+
+void SingleThreadScheduler::set_num_threads(unsigned int num_threads)
+{
+ ARM_COMPUTE_UNUSED(num_threads);
+}
+
+void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+{
+ ARM_COMPUTE_UNUSED(split_dimension);
+ kernel->run(kernel->window());
+}
+
+unsigned int SingleThreadScheduler::num_threads() const
+{
+ return 1;
+}
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index c99d59b..6f0da85 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -35,5 +35,5 @@
void INESimpleFunction::run()
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(_kernel.get());
+ NEScheduler::get().schedule(_kernel.get(), Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
new file mode 100644
index 0000000..a24429c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEBatchNormalizationLayer::NEBatchNormalizationLayer()
+ : _norm_kernel()
+{
+}
+
+void NEBatchNormalizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+ // Configure kernel
+ _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+}
+
+void NEBatchNormalizationLayer::run()
+{
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 2d7ad86..26f31f5 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -153,10 +153,10 @@
_border_mag_gradient.run(_border_mag_gradient.window());
// Run gradient
- NEScheduler::get().multithread(_gradient.get());
+ NEScheduler::get().schedule(_gradient.get(), Window::DimY);
// Run non-maxima suppression
- NEScheduler::get().multithread(&_non_max_suppr);
+ NEScheduler::get().schedule(&_non_max_suppr, Window::DimY);
ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
memset(_output->buffer(), 0, _output->info()->total_size());
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index c2b3d7a..3f39ae2 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -98,12 +98,12 @@
if(_is_separable)
{
- NEScheduler::get().multithread(&_kernel_hor);
- NEScheduler::get().multithread(&_kernel_vert);
+ NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
}
else
{
- NEScheduler::get().multithread(&_kernel);
+ NEScheduler::get().schedule(&_kernel, Window::DimY);
}
}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index aae4a67..bd688cf 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -33,33 +33,93 @@
using namespace arm_compute;
-NEConvolutionLayer::NEConvolutionLayer()
- : _input_im2col_kernel(), _input_interleave_kernel(), _weights_reshape_kernel(), _weights_transposed_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
- _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _is_first_run(false), _has_bias(false)
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
}
-void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
}
- _has_bias = (biases != nullptr);
- _is_first_run = true;
+ // Check if biases are present; if so, they will be embedded into the reshaped weights matrix
+ const bool _has_bias = (biases != nullptr);
- // Get parameters for conv_info
+ _transpose1xW = transpose1xW;
+
+ if(transpose1xW)
+ {
+ // Create tensor to store the reshaped weights
+ const unsigned int mat_weights_cols = weights->info()->dimension(3);
+ const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+
+ _weights_reshaped.allocator()->init(info_wr);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _weights_transposed_kernel.configure(&_weights_reshaped, output);
+ _weights_reshaped.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshape_kernel.configure(weights, biases, output);
+ }
+}
+
+void NEConvolutionLayerReshapeWeights::run()
+{
+ NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+ if(_transpose1xW)
+ {
+ NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
+ }
+}
+
+NEConvolutionLayer::NEConvolutionLayer()
+ : _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
+ _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+{
+}
+
+void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _has_bias = (biases != nullptr);
+ _are_weights_reshaped = weights_info.are_reshaped();
+
+ // Get parameters from conv_info
unsigned int stride_x = 0;
unsigned int stride_y = 0;
unsigned int pad_x = 0;
@@ -70,21 +130,46 @@
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+
+ const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size() : weights->info()->dimension(0);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
stride_x, stride_y, pad_x, pad_y, conv_info.round());
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
- // Create tensor to store the reshaped weights
- const unsigned int mat_weights_cols = weights->info()->dimension(3);
- const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, weights->info()->data_type());
- _weights_reshaped.allocator()->init(info_wr);
+ // Check if it's a "fully connected" convolution
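+ // With a 1x1 convolved output the GEMM collapses to a vector-matrix product, so the interleave and 1xW transpose steps below are skipped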
+ _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
- // Create tensor to store transposed weights
- TensorShape shape_wt(mat_weights_rows * 4, static_cast<unsigned int>(std::ceil(mat_weights_cols / 4.f)));
- TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
- _weights_transposed.allocator()->init(info_wt);
+ unsigned int mat_weights_cols = weights->info()->dimension(3);
+ unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+
+ // Reshape weights if needed
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = output->info()->dimension(2);
+ const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+ mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+ }
+ else
+ {
+ if(_is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wr);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ }
+ else
+ {
+ // Create tensor to store transposed weights
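+ // The 1xW transpose packs 16 bytes per block: 4 elements for F32, 16 for QS8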
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+ }
+ weights = &_weights_reshaped;
+ }
// Create tensor to store im2col reshaped inputs
const unsigned int mat_input_cols = mat_weights_rows;
@@ -93,58 +178,69 @@
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type());
- _input_im2col_reshaped.allocator()->init(info_im2col);
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
- // Create tensor to prepare input tensor for GEMM
- TensorShape shape_interleaved = shape_im2col;
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
- TensorInfo info_interleaved(shape_interleaved, 1, input->info()->data_type());
- _input_interleaved_reshaped.allocator()->init(info_interleaved);
+ // Create tensor (interleave) to prepare input tensor for GEMM
+ if(!_is_fully_connected_convolution)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ }
// Create GEMM output tensor
TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
- TensorInfo info_gemm(shape_gemm, 1, input->info()->data_type());
- _gemm_output.allocator()->init(info_gemm);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
// Configure kernels
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
- _weights_transposed_kernel.configure(&_weights_reshaped, &_weights_transposed);
- _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, &_gemm_output, 1.0f);
+ if(_is_fully_connected_convolution)
+ {
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ else
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ }
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
- // Allocate the tensors once the all configure methods have been called
- _weights_reshaped.allocator()->allocate();
- _weights_transposed.allocator()->allocate();
+ // Allocate intermediate tensors
+ if(!_are_weights_reshaped)
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
_input_im2col_reshaped.allocator()->allocate();
- _input_interleaved_reshaped.allocator()->allocate();
+ if(!_is_fully_connected_convolution)
+ {
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
_gemm_output.allocator()->allocate();
}
void NEConvolutionLayer::run()
{
// Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
+ if(!_are_weights_reshaped)
{
- _is_first_run = false;
- NEScheduler::get().multithread(&_weights_reshape_kernel, 3);
- NEScheduler::get().multithread(&_weights_transposed_kernel);
+ _are_weights_reshaped = true;
+ _reshape_weights.run();
}
// Run input reshaping
- NEScheduler::get().multithread(&_input_im2col_kernel);
+ NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+ if(!_is_fully_connected_convolution)
+ {
+ // Run interleave
+ NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+ }
- // Run interleave
- NEScheduler::get().multithread(&_input_interleave_kernel);
-
- // Runs GEMM on reshaped matrices
- NEScheduler::get().multithread(&_mm_kernel);
+ // Runs matrix multiply on reshaped matrices
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
// Reshape output matrix
- NEScheduler::get().multithread(&_output_col2im_kernel);
+ NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
new file mode 100644
index 0000000..7d2c549
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEDepthConcatenate::NEDepthConcatenate()
+ : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+{
+}
+
+void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+ _num_inputs = inputs_vector.size();
+ _concat_kernels_vector = arm_compute::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+
+ unsigned int depth_offset = 0;
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ depth_offset += inputs_vector.at(i)->info()->dimension(2);
+ }
+}
+
+void NEDepthConcatenate::run()
+{
+ for(unsigned i = 0; i < _num_inputs; ++i)
+ {
+ NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX);
+ NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
+ }
+}
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
index 5f3594a..a339cae 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvert.cpp
@@ -33,8 +33,8 @@
void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::U32, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::U16, DataType::S16, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type());
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
index 340e1ce..2887c13 100644
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -48,5 +48,5 @@
void NEDerivative::run()
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(&_kernel);
+ NEScheduler::get().schedule(&_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
new file mode 100644
index 0000000..3f3e771
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NEDirectConvolutionLayer::NEDirectConvolutionLayer()
+ : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+{
+}
+
+void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+
+ // Free accumulator
+ if(_accumulator.buffer() != nullptr)
+ {
+ _accumulator.allocator()->free();
+ }
+
+ // Allocate the intermediate accumulator tensor in case of fixed point input
+ if(output->info()->data_type() == DataType::QS8)
+ {
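+ // Accumulating in QS16 gives the fixed-point multiply-accumulates headroom before the bias addition converts the result back to QS8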
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ }
+ else
+ {
+ _conv_kernel.configure(input, weights, output, conv_info);
+ _accumulate_bias_kernel.configure(output, bias);
+ }
+
+ // Add zero padding XY
+ _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void NEDirectConvolutionLayer::run()
+{
+ _input_border_handler.run(_input_border_handler.window());
+
+ NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
+ NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
index a8b132d..f6ec677 100644
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -52,11 +52,11 @@
void NEEqualizeHistogram::run()
{
// Calculate histogram of input.
- NEScheduler::get().multithread(&_histogram_kernel);
+ NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
// Calculate cumulative distribution of histogram and create LUT.
_cd_histogram_kernel.run(_cd_histogram_kernel.window());
// Map input to output using created LUT.
- NEScheduler::get().multithread(&_map_histogram_kernel);
+ NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 670b4d4..33a58f1 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -90,12 +90,12 @@
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(&_fast_corners_kernel);
+ NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
if(_non_max)
{
- NEScheduler::get().multithread(&_nonmax_kernel);
+ NEScheduler::get().schedule(&_nonmax_kernel, Window::DimY);
}
- NEScheduler::get().multithread(&_fill_kernel);
+ NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index 7ff8f2f..e884f4a 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -35,5 +35,5 @@
void NEFillBorder::run()
{
- NEScheduler::get().multithread(&_border_handler, Window::DimZ);
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
}
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index e6785b3..abb41e9 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -31,35 +31,99 @@
using namespace arm_compute;
+NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights()
+ : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+{
+}
+
+void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
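+ // At least one of the two reshape operations (transpose or 1xW transpose) must be requested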
+ ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _transpose_weights = transpose_weights;
+ _is_batched_fc_layer = is_batched_fc_layer;
+
+ // Check if we need to transpose the weights
+ if(_transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Initialize the output tensor for transpose
+ TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
+ _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+ _transpose_kernel.configure(input, &_transpose_output);
+
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(&_transpose_output, output);
+
+ // Allocate temporary tensor used for transposing the weights
+ _transpose_output.allocator()->allocate();
+ }
+ else
+ {
+ _transpose_kernel.configure(input, output);
+ }
+ }
+ else
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+ }
+ }
+}
+
+void NEFullyConnectedLayerReshapeWeights::run()
+{
+ if(_transpose_weights)
+ {
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
+ if(_is_batched_fc_layer)
+ {
+ NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY);
+ }
+}
+
NEFullyConnectedLayer::NEFullyConnectedLayer()
- : _im2col_kernel(), _transpose_kernel(), _transpose1xW_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _transpose_output(),
- _transpose1xW_output(), _is_first_run(true), _transpose_weights(true), _fc_after_conv(false), _batched_fc_layer(false), _accumulate_biases(false)
+ : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
+ _are_weights_reshaped(false), _is_fc_after_conv(false), _is_batched_fc_layer(false), _accumulate_biases(false)
{
}
void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
TensorShape shape_im2col;
- shape_im2col.set(0, weights->info()->dimension(1));
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
shape_im2col.set(1, input->info()->dimension(3));
shape_im2col.set(2, input->info()->dimension(4));
shape_im2col.set(3, input->info()->dimension(5));
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
// Initialize output tensor for interleave 4x4
TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
- // Initialize output tensor for transpose 1xW
- TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
- _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
// Configure im2col kernel
_im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -67,55 +131,49 @@
// Configure interleave4x4 kernel
_interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
// Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
// Allocate the tensors once all the configure methods have been called
_im2col_output.allocator()->allocate();
_interleave4x4_output.allocator()->allocate();
- _transpose1xW_output.allocator()->allocate();
}
void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
{
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
// Initialize output tensor for interleave 4x4
TensorShape shape_interleaved = input->info()->tensor_shape();
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
- // Initialize output tensor for transpose 1xW
- TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
- _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
// Configure interleave4x4 kernel
_interleave4x4_kernel.configure(input, &_interleave4x4_output);
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
// Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
// Allocate the tensors once all the configure methods have been called
_interleave4x4_output.allocator()->allocate();
- _transpose1xW_output.allocator()->allocate();
}
void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
{
ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
TensorShape shape_im2col;
- shape_im2col.set(0, weights->info()->dimension(1));
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
shape_im2col.set(1, 1);
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
// Configure im2col kernel
_im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -135,20 +193,21 @@
_mm_kernel.configure(input, weights, output, 1.0f);
}
-void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights)
+void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
- _is_first_run = true;
- _transpose_weights = transpose_weights;
- _fc_after_conv = true;
- _batched_fc_layer = false;
- _accumulate_biases = false;
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
- const ITensor *weights_to_use = weights;
+ _are_weights_reshaped = are_weights_reshaped;
+ _is_fc_after_conv = true;
+ _is_batched_fc_layer = false;
+ _accumulate_biases = false;
if(biases != nullptr)
{
@@ -160,17 +219,6 @@
_accumulate_biases_kernel.configure(output, biases);
}
- // Check if we need to transpose the weights
- if(_transpose_weights)
- {
- // Initialize the output tensor for transpose
- TensorShape shape_transposed(weights->info()->dimension(1), weights->info()->dimension(0));
- _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, weights->info()->data_type()));
- _transpose_kernel.configure(weights, &_transpose_output);
-
- weights_to_use = &_transpose_output;
- }
-
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
// 2) Fully Connected layer -> Fully Connected layer without batches
@@ -178,15 +226,54 @@
// 4) Fully Connected layer -> Fully Connected layer with batches
// Check if we have a fully connected layer with batches
- _batched_fc_layer = (output->info()->dimension(1) > 1);
+ _is_batched_fc_layer = (output->info()->dimension(1) > 1);
- if(_batched_fc_layer)
+ const ITensor *weights_to_use = weights;
+
+ if(!are_weights_reshaped)
{
- _fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ if((transpose_weights || _is_batched_fc_layer))
+ {
+ weights_to_use = &_reshape_weights_output;
- if(_fc_after_conv)
+ if(transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ else
+ {
+ TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
+
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ }
+ }
+
+ if(_is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+
+ if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer with batches
configure_conv_fc_wb(input, weights_to_use, output);
@@ -199,9 +286,10 @@
}
else
{
- _fc_after_conv = (weights_to_use->info()->dimension(1) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+ // In the non-batched case the weights are not reshaped with the transpose1xW kernel
+ _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
- if(_fc_after_conv)
+ if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
configure_conv_fc_nb(input, weights_to_use, output);
@@ -213,47 +301,44 @@
}
}
- // Allocate the transpose tensor if the transpose_weights flag is true and once all the configure methods have been called
- if(_transpose_weights)
+ // Once all the configure methods have been called, allocate the reshaped weights tensor if the weights still need reshaping
+ if(!are_weights_reshaped)
{
- _transpose_output.allocator()->allocate();
+ if(transpose_weights || _is_batched_fc_layer)
+ {
+ // Allocate the reshaped weights output tensor
+ _reshape_weights_output.allocator()->allocate();
+ }
}
}
void NEFullyConnectedLayer::run()
{
// Reshape of the weights (happens only once)
- if(_is_first_run)
+ if(!_are_weights_reshaped)
{
- _is_first_run = false;
- if(_transpose_weights)
- {
- NEScheduler::get().multithread(&_transpose_kernel);
- }
- if(_batched_fc_layer)
- {
- NEScheduler::get().multithread(&_transpose1xW_kernel);
- }
+ _are_weights_reshaped = true;
+ _reshape_weights_kernel.run();
}
// Linearize input if comes from a convolutional layer
- if(_fc_after_conv)
+ if(_is_fc_after_conv)
{
- NEScheduler::get().multithread(&_im2col_kernel);
+ NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
}
// Interleave input
- if(_batched_fc_layer)
+ if(_is_batched_fc_layer)
{
- NEScheduler::get().multithread(&_interleave4x4_kernel);
+ NEScheduler::get().schedule(&_interleave4x4_kernel, Window::DimY);
}
// Run matrix multiply
- NEScheduler::get().multithread(&_mm_kernel);
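+ // Split along Y when the layer is batched; a single-row vector-matrix product is split along X instead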
+ NEScheduler::get().schedule(&_mm_kernel, _is_batched_fc_layer ? Window::DimY : Window::DimX);
// Accumulate biases if provided
if(_accumulate_biases)
{
- NEScheduler::get().multithread(&_accumulate_biases_kernel);
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
}
}
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index f155dd5..15d5f4e 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -43,16 +43,16 @@
void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16, DataType::QS8);
if(c != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
+ ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
}
@@ -60,8 +60,8 @@
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- // Check if the first input tensor is a vector and the data type is F32. If so, all the kernels for reshaping the tensors can be skipped
- if((a->info()->dimension(1) == 1) && (a->info()->data_type() == DataType::F32))
+ // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
+ if((a->info()->dimension(1) == 1))
{
_run_vector_matrix_multiplication = true;
@@ -94,14 +94,20 @@
break;
}
#endif
+ case DataType::QS8:
+ {
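+ // QS8 elements are one byte, so a 16-byte transpose1xW block holds 16 of them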
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.0f));
+ break;
+ }
default:
{
ARM_COMPUTE_ERROR_ON("Data type not supported");
}
}
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
@@ -133,18 +139,18 @@
if(!_run_vector_matrix_multiplication)
{
// Run interleave kernel
- NEScheduler::get().multithread(&_interleave_kernel);
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
// Run transpose kernel
- NEScheduler::get().multithread(&_transpose_kernel);
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
}
// Run matrix multiply kernel
- NEScheduler::get().multithread(&_mm_kernel, _run_vector_matrix_multiplication ? 0 : 1);
+ NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
// Run matrix addition kernel
if(_run_addition)
{
- NEScheduler::get().multithread(&_ma_kernel);
+ NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
}
}
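
The recurring change in this and the following runtime functions replaces NEScheduler::get().multithread(kernel) with NEScheduler::get().schedule(kernel, split_dimension), making the parallelisation dimension explicit at every call site. A minimal sketch of driving NEGEMM through the new interface, assuming illustrative F32 shapes (a is 32x64, b is 64x16, so d is 32x16; none of these names or sizes come from the diff):

    // Sketch only: tensor names and shapes are illustrative.
    #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_gemm_example()
    {
        Tensor a, b, d;
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32)); // 32 rows, 64 cols
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32)); // 64 rows, 16 cols
        d.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // 32 rows, 16 cols

        NEGEMM gemm;
        gemm.configure(&a, &b, nullptr, &d, 1.0f, 0.0f); // d = a * b, no C matrix, no addition

        a.allocator()->allocate();
        b.allocator()->allocate();
        d.allocator()->allocate();

        // Internally this now dispatches via NEScheduler::get().schedule(kernel, Window::DimY)
        // (or Window::DimX for the vector-times-matrix case) instead of the removed multithread().
        gemm.run();
    }
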
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
index 3866f28..b64f769 100644
--- a/src/runtime/NEON/functions/NEGEMMLowp.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp
@@ -49,14 +49,14 @@
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
- /* The interleaved output matrix will have the following shape: [ a_height * 4, a_width / 4 ] */
+ /* The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] */
TensorShape shape_tmp_a = a->info()->tensor_shape();
shape_tmp_a.set(0, a->info()->dimension(0) * 4);
shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
TensorShape shape_tmp_b = b->info()->tensor_shape();
- shape_tmp_b.set(0, b->info()->dimension(1) * 4);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.f));
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
@@ -74,11 +74,11 @@
void NEGEMMLowp::run()
{
/* Run interleave kernel */
- NEScheduler::get().multithread(&_interleave_kernel);
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
/* Run transpose kernel */
- NEScheduler::get().multithread(&_transpose_kernel);
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
/* Run matrix multiply kernel */
- NEScheduler::get().multithread(&_mm_kernel);
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
}
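
With the switch to the 1x16 transpose, the temporary B matrix is laid out as [ b_height * 16, ceil(b_width / 16) ]. A standalone sketch of that shape arithmetic with made-up dimensions (b is 100 wide and 37 high):

    // Shape arithmetic sketch for the 1x16 transpose; dimensions are made up.
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const unsigned int b_width  = 100; // b->info()->dimension(0)
        const unsigned int b_height = 37;  // b->info()->dimension(1)

        const unsigned int tmp_b_width  = b_height * 16;                                          // 592
        const unsigned int tmp_b_height = static_cast<unsigned int>(std::ceil(b_width / 16.0f));  // ceil(6.25) = 7

        std::printf("shape_tmp_b = [ %u, %u ]\n", tmp_b_width, tmp_b_height);
        return 0;
    }
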
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
index 8cba30d..dc40ece 100644
--- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -34,11 +34,6 @@
void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(1) * 4);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f));
auto k = arm_compute::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
k->configure(input, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index 34447b1..5ccc765 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -55,6 +55,6 @@
void NEGaussian5x5::run()
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(&_kernel_hor);
- NEScheduler::get().multithread(&_kernel_vert);
+ NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index cb8296b..e1d64f1 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -108,8 +108,8 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
_border_handler[i].run(_border_handler[i].window());
- NEScheduler::get().multithread(_horizontal_reduction.get() + i);
- NEScheduler::get().multithread(_vertical_reduction.get() + i);
+ NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
+ NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
}
}
@@ -178,6 +178,6 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
_gaus5x5[i].run();
- NEScheduler::get().multithread(_scale_nearest.get() + i);
+ NEScheduler::get().schedule(_scale_nearest.get() + i, Window::DimY);
}
}
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index a5073b9..a592f53 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -92,8 +92,8 @@
_gradient.run();
// Run orientation binning kernel
- NEScheduler::get().multithread(&_orient_bin);
+ NEScheduler::get().schedule(&_orient_bin, Window::DimY);
// Run block normalization kernel
- NEScheduler::get().multithread(&_block_norm);
+ NEScheduler::get().schedule(&_block_norm, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
index f0d6121..e8ed29d 100644
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -31,8 +31,6 @@
void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
{
auto k = arm_compute::cpp14::make_unique<NEHOGDetectorKernel>();
-
k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
-
_kernel = std::move(k);
-}
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index c5b37f4..2f4b880 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -76,5 +76,5 @@
_derivative.run();
// Run magnitude/phase kernel
- NEScheduler::get().multithread(_mag_phase.get());
+ NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index effa64f..173b8f4 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -115,7 +115,7 @@
_orient_bin_kernel = arm_compute::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
_block_norm_kernel = arm_compute::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
_hog_detect_kernel = arm_compute::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
- _non_maxima_kernel = arm_compute::cpp14::make_unique<NEHOGNonMaximaSuppressionKernel>();
+ _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
_hog_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
_hog_norm_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
@@ -208,13 +208,13 @@
// Run orientation binning kernel
for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
{
- NEScheduler::get().multithread(_orient_bin_kernel.get() + i);
+ NEScheduler::get().schedule(_orient_bin_kernel.get() + i, Window::DimY);
}
// Run block normalization kernel
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
- NEScheduler::get().multithread(_block_norm_kernel.get() + i);
+ NEScheduler::get().schedule(_block_norm_kernel.get() + i, Window::DimY);
}
// Run HOG detector kernel
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index adefd47..b54fb67 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -199,13 +199,13 @@
_border_gy.run(_border_gy.window());
// Run harris score kernel
- NEScheduler::get().multithread(_harris_score.get());
+ NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
// Run non-maxima suppression
_non_max_suppr.run();
// Run corner candidate kernel
- NEScheduler::get().multithread(&_candidates);
+ NEScheduler::get().schedule(&_candidates, Window::DimY);
// Run sort & euclidean distance
_sort_euclidean.run(_sort_euclidean.window());
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index 6747f2e..c42b2a5 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -54,5 +54,5 @@
void NEHistogram::run()
{
// Calculate histogram of input.
- NEScheduler::get().multithread(&_histogram_kernel);
+ NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
new file mode 100644
index 0000000..85d7ba3
--- /dev/null
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NELocallyConnectedLayer::NELocallyConnectedLayer()
+ : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+{
+}
+
+void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
+ }
+
+ bool _has_bias = (biases != nullptr);
+ _is_first_run = true;
+
+ // Get parameters for conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ std::tie(pad_x, pad_y) = conv_info.pad();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+ stride_x, stride_y, pad_x, pad_y, conv_info.round());
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+ // Create tensor to store the reshaped weights
+ const size_t mat_weights_cols = weights->info()->dimension(3);
+ const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ const size_t mat_weights_num = weights->info()->dimension(4);
+
+ const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+ _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+
+ // Create tensor to store im2col reshaped inputs
+ const size_t mat_input_cols = mat_weights_rows;
+ const size_t mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+ // Create locally connected layer output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+ // Configure kernels
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ // Allocate intermediate tensors
+ _weights_reshaped.allocator()->allocate();
+ _input_im2col_reshaped.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+}
+
+void NELocallyConnectedLayer::run()
+{
+ // Run weights reshaping (Runs once for every configure)
+ if(_is_first_run)
+ {
+ _is_first_run = false;
+ NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+ }
+
+ // Run input reshaping
+ NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+
+ // Runs GEMM on reshaped matrices
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimX);
+
+ // Reshape output matrix
+ NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+}
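
A hedged usage sketch for the new locally connected layer; the shapes below are illustrative and chosen to satisfy the checks in configure() (3x3 kernels over an 8x8x3 input with stride 1 and no padding give 6x6 = 36 output locations, each with its own set of 16 filters):

    // Sketch only: shapes are illustrative, not taken from the diff.
    #include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_locally_connected_example()
    {
        Tensor src, weights, biases, dst;

        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
        // [ kernel_w, kernel_h, ifm, ofm, num_output_locations ]
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U, 36U), 1, DataType::F32));
        // One bias per output channel and per output location
        biases.allocator()->init(TensorInfo(TensorShape(16U, 36U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(6U, 6U, 16U), 1, DataType::F32));

        NELocallyConnectedLayer lc;
        lc.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();

        lc.run(); // weights are reshaped on the first run only
    }
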
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
index 3fb5769..47143f5 100644
--- a/src/runtime/NEON/functions/NEMeanStdDev.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -43,5 +43,5 @@
_global_sum = 0;
_global_sum_squared = 0;
- NEScheduler::get().multithread(&_mean_stddev_kernel);
+ NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
index ba73ef9..cab9200 100644
--- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -43,8 +43,8 @@
_min_max.reset();
/* Run min max kernel */
- NEScheduler::get().multithread(&_min_max);
+ NEScheduler::get().schedule(&_min_max, Window::DimY);
/* Run min max location */
- NEScheduler::get().multithread(&_min_max_loc);
+ NEScheduler::get().schedule(&_min_max_loc, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index ff38e61..69ff325 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -41,7 +41,7 @@
{
ARM_COMPUTE_ERROR_ON(input == nullptr);
- TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type());
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
_input_squared.allocator()->init(tensor_info);
// Configure kernels
@@ -55,7 +55,7 @@
void NENormalizationLayer::run()
{
- NEScheduler::get().multithread(&_multiply_kernel);
- NEScheduler::get().multithread(&_border_handler);
- NEScheduler::get().multithread(&_norm_kernel);
+ NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_border_handler, Window::DimY);
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index 993153b..49135e4 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -113,7 +113,7 @@
// Run Scharr kernel
_func_scharr[level - 1].run();
- /* Run Lucas-Kanade kernel */
- NEScheduler::get().multithread(_kernel_tracker.get() + level - 1, Window::DimX);
+ // Run Lucas-Kanade kernel
+ NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
}
}
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index 1859b30..8967a22 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -76,6 +76,6 @@
void NESobel5x5::run()
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(&_sobel_hor);
- NEScheduler::get().multithread(&_sobel_vert);
+ NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index 8af5e8d..f628da9 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -76,6 +76,6 @@
void NESobel7x7::run()
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(&_sobel_hor);
- NEScheduler::get().multithread(&_sobel_vert);
+ NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 55d4d3a..0651eab 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -32,21 +32,22 @@
using namespace arm_compute;
NESoftmaxLayer::NESoftmaxLayer()
- : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _fill_border_kernel_sum(), _max(), _sum(), _tmp()
+ : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
{
}
void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
// Create intermediate tensors shapes
- _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
+ TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+ _tmp.allocator()->init(tensor_info_tmp);
TensorShape shape = input->info()->tensor_shape();
shape.set(0, 1);
- TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
+ TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
_max.allocator()->init(tensor_info_max_sum);
_sum.allocator()->init(tensor_info_max_sum);
@@ -55,9 +56,6 @@
_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
_norm_kernel.configure(&_tmp, &_sum, output);
_fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX));
- // Fill the border around tmp buffer with sensible negative value.
- // This avoids exp(-FLT_MAX) which will lead to -inf and destroy the calculation of sum when input is not a multiple of processed elements
- _fill_border_kernel_sum.configure(input, _shift_exp_sum_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-50.f));
// Allocate intermediate tensors
_tmp.allocator()->allocate();
@@ -67,9 +65,8 @@
void NESoftmaxLayer::run()
{
- NEScheduler::get().multithread(&_fill_border_kernel);
- NEScheduler::get().multithread(&_max_kernel);
- NEScheduler::get().multithread(&_fill_border_kernel_sum);
- NEScheduler::get().multithread(&_shift_exp_sum_kernel);
- NEScheduler::get().multithread(&_norm_kernel);
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_max_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
}
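
The intermediate tensors now carry the input's fixed_point_position, so QS8 inputs keep their Q format through the max, shift/exp/sum and norm stages. A sketch of running the layer on a QS8 vector, using the four-argument TensorInfo constructor shown above (the shape and fixed point position are illustrative):

    // Sketch only: a QS8 softmax over a 128-element vector; Q2.5 format chosen arbitrarily.
    #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_qs8_softmax_example()
    {
        Tensor src, dst;
        const int fixed_point_position = 5;

        src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::QS8, fixed_point_position));
        dst.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::QS8, fixed_point_position));

        NESoftmaxLayer softmax;
        softmax.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();

        softmax.run();
    }
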
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
new file mode 100644
index 0000000..0cced73
--- /dev/null
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/OMP/OMPScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+
+#include <omp.h>
+
+using namespace arm_compute;
+
+OMPScheduler &OMPScheduler::get()
+{
+ static OMPScheduler scheduler;
+ return scheduler;
+}
+
+OMPScheduler::OMPScheduler()
+ : _num_threads(omp_get_max_threads())
+{
+}
+
+unsigned int OMPScheduler::num_threads() const
+{
+ return _num_threads;
+}
+
+void OMPScheduler::set_num_threads(unsigned int num_threads)
+{
+ const unsigned int num_cores = omp_get_max_threads();
+ _num_threads = num_threads == 0 ? num_cores : num_threads;
+}
+
+void OMPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
+
+ const Window &max_window = kernel->window();
+ const unsigned int num_iterations = max_window.num_iterations(split_dimension);
+ const unsigned int num_threads = std::min(num_iterations, _num_threads);
+
+ if(!kernel->is_parallelisable() || 1 == num_threads)
+ {
+ kernel->run(max_window);
+ }
+ else
+ {
+ #pragma omp parallel num_threads(num_threads)
+ {
+ #pragma omp for
+ for(unsigned int t = 0; t < num_threads; ++t)
+ {
+ Window win = max_window.split_window(split_dimension, t, num_threads);
+ win.set_thread_id(t);
+ win.set_num_threads(num_threads);
+ kernel->run(win);
+ }
+ }
+ }
+}
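
OMPScheduler caps the team size at the number of window iterations along the split dimension and falls back to a plain single-threaded run for non-parallelisable kernels. The splitting pattern can be illustrated with plain OpenMP, independent of arm_compute (the iteration count below is made up):

    // Standalone sketch of the split-by-thread pattern used in OMPScheduler::schedule().
    #include <omp.h>
    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const unsigned int num_iterations = 10; // e.g. window iterations along Window::DimY
        const unsigned int max_threads    = omp_get_max_threads();
        const unsigned int num_threads    = std::min(num_iterations, max_threads);

        #pragma omp parallel num_threads(num_threads)
        {
            #pragma omp for
            for(unsigned int t = 0; t < num_threads; ++t)
            {
                // Each t corresponds to one sub-window produced by Window::split_window().
                std::printf("processing slice %u of %u\n", t, num_threads);
            }
        }
        return 0;
    }
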
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
new file mode 100644
index 0000000..a131928
--- /dev/null
+++ b/src/runtime/Scheduler.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Scheduler.h"
+
+#include "arm_compute/core/Error.h"
+#if ARM_COMPUTE_CPP_SCHEDULER
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+#endif
+
+#include "arm_compute/runtime/SingleThreadScheduler.h"
+
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+#include "arm_compute/runtime/OMP/OMPScheduler.h"
+#endif
+
+using namespace arm_compute;
+
+#if !ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::OMP;
+#elif ARM_COMPUTE_CPP_SCHEDULER && !ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
+#elif ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
+#else
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
+#endif
+
+void Scheduler::set(Type t)
+{
+ ARM_COMPUTE_ERROR_ON(!Scheduler::is_available(t));
+ _scheduler_type = t;
+}
+
+bool Scheduler::is_available(Type t)
+{
+ switch(t)
+ {
+ case Type::ST:
+ {
+ return true;
+ }
+ case Type::CPP:
+ {
+#if ARM_COMPUTE_CPP_SCHEDULER
+ return true;
+#else
+ return false;
+#endif
+ }
+ case Type::OMP:
+ {
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+ return true;
+#else
+ return false;
+#endif
+ }
+ case Type::CUSTOM:
+ {
+ return _custom_scheduler != nullptr;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid Scheduler type");
+ return false;
+ }
+ }
+}
+
+Scheduler::Type Scheduler::get_type()
+{
+ return _scheduler_type;
+}
+
+IScheduler &Scheduler::get()
+{
+ switch(_scheduler_type)
+ {
+ case Type::ST:
+ {
+ return SingleThreadScheduler::get();
+ }
+ case Type::CPP:
+ {
+#if ARM_COMPUTE_CPP_SCHEDULER
+ return CPPScheduler::get();
+#else
+ ARM_COMPUTE_ERROR("Recompile with cppthreads=1 to use C++11 scheduler.");
+#endif
+ break;
+ }
+ case Type::OMP:
+ {
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+ return OMPScheduler::get();
+#else
+ ARM_COMPUTE_ERROR("Recompile with openmp=1 to use openmp scheduler.");
+#endif
+ break;
+ }
+ case Type::CUSTOM:
+ {
+ if(_custom_scheduler == nullptr)
+ {
+ ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) before Scheduler::get()");
+ }
+ else
+ {
+ return *_custom_scheduler;
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid Scheduler type");
+ break;
+ }
+ }
+ return SingleThreadScheduler::get();
+}
+
+std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
+
+void Scheduler::set(std::shared_ptr<IScheduler> &scheduler)
+{
+ _custom_scheduler = scheduler;
+ set(Type::CUSTOM);
+}
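
Scheduler turns the backend into a runtime choice: Scheduler::set(Type) selects a compiled-in scheduler, while Scheduler::set(std::shared_ptr<IScheduler>&) registers a user-provided one and switches the type to CUSTOM. A sketch of both paths follows; MyScheduler is hypothetical, and its overrides assume IScheduler exposes the same three virtuals implemented by OMPScheduler above:

    // Sketch only: MyScheduler is hypothetical; the IScheduler interface is assumed to be
    // num_threads() / set_num_threads() / schedule(), mirroring OMPScheduler above.
    #include "arm_compute/runtime/Scheduler.h"
    #include "arm_compute/runtime/IScheduler.h"
    #include "arm_compute/core/CPP/ICPPKernel.h"

    #include <memory>

    using namespace arm_compute;

    class MyScheduler : public IScheduler
    {
    public:
        void set_num_threads(unsigned int) override {}
        unsigned int num_threads() const override { return 1; }
        void schedule(ICPPKernel *kernel, unsigned int) override
        {
            kernel->run(kernel->window()); // run the whole window single-threaded
        }
    };

    void select_schedulers()
    {
        // Pick a built-in backend only if it was compiled in.
        if(Scheduler::is_available(Scheduler::Type::OMP))
        {
            Scheduler::set(Scheduler::Type::OMP);
        }

        // Or register a custom scheduler; this also switches the type to CUSTOM.
        std::shared_ptr<IScheduler> custom = std::make_shared<MyScheduler>();
        Scheduler::set(custom);
    }
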
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
new file mode 100644
index 0000000..32924be
--- /dev/null
+++ b/src/runtime/SubTensor.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/SubTensor.h"
+
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+ : _parent(nullptr), _info()
+{
+ ARM_COMPUTE_ERROR_ON(parent == nullptr);
+ _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+ _parent = parent;
+}
+
+ITensorInfo *SubTensor::info() const
+{
+ return &_info;
+}
+
+ITensorInfo *SubTensor::info()
+{
+ return &_info;
+}
+
+uint8_t *SubTensor::buffer() const
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->buffer();
+}
+
+ITensor *SubTensor::parent()
+{
+ return _parent;
+}
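
SubTensor is a view: it keeps a SubTensorInfo describing the region but forwards buffer() to the parent, so no extra memory is allocated. A usage sketch with illustrative shapes:

    // Sketch only: an 8x8 view placed at offset (4, 4) inside a 16x16 parent tensor.
    #include "arm_compute/runtime/SubTensor.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void subtensor_example()
    {
        Tensor parent;
        parent.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        parent.allocator()->allocate();

        // The sub-tensor shares the parent's allocation; only the metadata differs.
        SubTensor view(&parent, TensorShape(8U, 8U), Coordinates(4, 4));

        ITensorInfo *info = view.info(); // SubTensorInfo seen through the ITensorInfo interface
        (void)info;
    }
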
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
index 617e7a8..435068c 100644
--- a/src/runtime/Tensor.cpp
+++ b/src/runtime/Tensor.cpp
@@ -30,12 +30,12 @@
{
}
-TensorInfo *Tensor::info() const
+ITensorInfo *Tensor::info() const
{
return &_allocator.info();
}
-TensorInfo *Tensor::info()
+ITensorInfo *Tensor::info()
{
return &_allocator.info();
}
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
new file mode 100644
index 0000000..1b06117
--- /dev/null
+++ b/src/runtime/Utils.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Utils.h"
+
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+const std::string &arm_compute::string_from_scheduler_type(Scheduler::Type t)
+{
+ static std::map<Scheduler::Type, const std::string> scheduler_type_map =
+ {
+ { Scheduler::Type::ST, "Single Thread" },
+ { Scheduler::Type::CPP, "C++11 Threads" },
+ { Scheduler::Type::OMP, "OpenMP Threads" },
+ { Scheduler::Type::CUSTOM, "Custom" }
+ };
+
+ return scheduler_type_map[t];
+}