arm_compute v17.10
Change-Id: If1489af40eccd0219ede8946577afbf04db31b29
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index e165cf3..6e5e802 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -569,7 +569,7 @@
};
CLKernelLibrary::CLKernelLibrary()
- : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map(), _max_workgroup_size(0)
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
{
}
@@ -709,19 +709,18 @@
return program_source_it->second;
}
-size_t CLKernelLibrary::max_local_workgroup_size()
+size_t CLKernelLibrary::max_local_workgroup_size(const cl::Kernel &kernel) const
{
- if(_max_workgroup_size == 0)
- {
- size_t err = clGetDeviceInfo(_device.get(), CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &_max_workgroup_size, nullptr);
- ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
- ARM_COMPUTE_UNUSED(err);
- }
+ size_t result;
- return _max_workgroup_size;
+ size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ ARM_COMPUTE_UNUSED(err);
+
+ return result;
}
-cl::NDRange CLKernelLibrary::default_ndrange()
+cl::NDRange CLKernelLibrary::default_ndrange() const
{
- return cl::NDRange(std::min<size_t>(_max_workgroup_size, 128u), 1);
+ return cl::NDRange(128u, 1);
}
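Note: these two hunks replace the cached, device-wide CL_DEVICE_MAX_WORK_GROUP_SIZE query with a per-kernel CL_KERNEL_WORK_GROUP_SIZE lookup, since a compiled kernel may support a smaller workgroup than the device maximum. A minimal sketch of the new query through the OpenCL C++ wrapper (the `kernel` and `device` variables here are illustrative):

    // Largest workgroup the driver accepts for this particular compiled
    // kernel; it can be smaller than the device-wide maximum.
    size_t max_wg_size = 0;
    cl_int err = kernel.getWorkGroupInfo(device, CL_KERNEL_WORK_GROUP_SIZE, &max_wg_size);
    // err == CL_SUCCESS (0) when max_wg_size is valid.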
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 1e04f00..17b58b7 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -52,18 +52,28 @@
(window.y().end() - window.y().start()) / window.y().step(),
(window.z().end() - window.z().start()) / window.z().step());
+ cl::NDRange valid_lws;
+ if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
+ {
+ valid_lws = cl::NullRange;
+ }
+ else
+ {
+ valid_lws = lws_hint;
+ }
+
cl::NDRange lws = cl::NullRange;
- if((lws_hint[0] <= gws[0]) && (lws_hint[1] <= gws[1]) && (lws_hint[2] <= gws[2]))
+ if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
{
- lws = lws_hint;
+ lws = valid_lws;
}
queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws);
}
ICLKernel::ICLKernel()
- : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id)
+ : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0)
{
}
@@ -163,3 +173,12 @@
{
return _target;
}
+
+size_t ICLKernel::get_max_workgroup_size()
+{
+ if(_max_workgroup_size == 0)
+ {
+ _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel);
+ }
+ return _max_workgroup_size;
+}
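Note: combined with the hunk above, enqueueing now validates the local-workgroup-size hint before use: if the product of its three dimensions exceeds what the kernel supports, the code falls back to cl::NullRange and lets the driver choose. get_max_workgroup_size() lazily caches the CL_KERNEL_WORK_GROUP_SIZE query on first call. A condensed sketch of the decision (illustrative only; `kernel` is an ICLKernel):

    // Fall back to a driver-chosen local size when the tuned hint is too
    // big for this kernel.
    const bool hint_fits = lws_hint[0] * lws_hint[1] * lws_hint[2] <= kernel.get_max_workgroup_size();
    const cl::NDRange valid_lws = hint_fits ? lws_hint : cl::NullRange;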
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 1d04f39..287c5e2 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -100,6 +100,8 @@
clGetDeviceInfo = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
clRetainEvent = reinterpret_cast<clRetainEvent_func>(dlsym(handle, "clRetainEvent"));
+ clGetPlatformIDs = reinterpret_cast<clGetPlatformIDs_func>(dlsym(handle, "clGetPlatformIDs"));
+ clGetKernelWorkGroupInfo = reinterpret_cast<clGetKernelWorkGroupInfo_func>(dlsym(handle, "clGetKernelWorkGroupInfo"));
dlclose(handle);
@@ -632,3 +634,37 @@
return CL_OUT_OF_RESOURCES;
}
}
+
+cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetPlatformIDs;
+ if(func != nullptr)
+ {
+ return func(num_entries, platforms, num_platforms);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int
+clGetKernelWorkGroupInfo(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo;
+ if(func != nullptr)
+ {
+ return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index b7423d8..f7aa5eb 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -44,7 +44,7 @@
/** Apply batch normalization.
*
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/QS16/F32
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QS8/QS16/F16/F32
* @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
@@ -100,7 +100,7 @@
Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- _in = 0;
+ data = 0;
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
denominator = 0;
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -114,13 +114,13 @@
const int current_slice = get_global_id(2);
- _in = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
- denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(epsilon)));
+ denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
// Calculate x bar and store results
numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
- numerator = SUB_OP(_in, numerator);
+ numerator = SUB_OP(data, numerator);
x_bar = MUL_OP(numerator, denominator);
gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * beta.stride_x));
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
index f5ec85a..01d8b90 100644
--- a/src/core/CL/cl_kernels/color_convert.cl
+++ b/src/core/CL/cl_kernels/color_convert.cl
@@ -54,7 +54,7 @@
uchar16 rgb_2 = vload16(0, in.ptr + 32);
uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255);
- uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.f, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
+ uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255);
uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255);
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl
index fb516dd..7b73b85 100644
--- a/src/core/CL/cl_kernels/direct_convolution1x1.cl
+++ b/src/core/CL/cl_kernels/direct_convolution1x1.cl
@@ -191,7 +191,7 @@
weights.ptr += z_index * weights_stride_w;
- for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+ for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
{
DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
VEC_DATA_TYPE(DATA_TYPE, 8)
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl
index d094eca..1420d7c 100644
--- a/src/core/CL/cl_kernels/direct_convolution3x3.cl
+++ b/src/core/CL/cl_kernels/direct_convolution3x3.cl
@@ -138,7 +138,7 @@
const int kernel_index = get_global_id(2);
weights_addr += kernel_index * weights_stride_w;
- for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+ for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
{
CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
index 496da97..6fdd019 100644
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl
@@ -127,7 +127,7 @@
const int kernel_index = get_global_id(2);
weights_addr += kernel_index * weights_stride_w;
- for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+ for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
{
CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 68af64e..4421e74 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -25,6 +25,9 @@
#define ARM_COMPUTE_HELPER_H
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
#define EXPAND(x) x
diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl
index 0a4748f..f41821c 100644
--- a/src/core/CL/cl_kernels/warp_affine.cl
+++ b/src/core/CL/cl_kernels/warp_affine.cl
@@ -84,7 +84,7 @@
{
Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
Image out = CONVERT_TO_IMAGE_STRUCT(out);
- vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
+ vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
}
/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
@@ -116,5 +116,5 @@
{
Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
Image out = CONVERT_TO_IMAGE_STRUCT(out);
- vstore4(bilinear_interpolate(&in, clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), width, height), 0, out.ptr);
+ vstore4(bilinear_interpolate(&in, apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), 0, out.ptr);
}
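Note: two behavioural fixes here. The nearest-neighbour path now converts the transformed coordinates with convert_int8_rtn (round toward negative infinity) instead of the default round-toward-zero, and the bilinear path no longer pre-clamps the coordinates, leaving border handling to bilinear_interpolate. The rounding mode matters for negative coordinates; a small C++ illustration of the difference:

    #include <cmath>

    // x = -0.25 lands in the pixel at index -1, not index 0:
    void rounding_example()
    {
        int toward_zero = static_cast<int>(-0.25f);             // 0: round toward zero
        int toward_neg  = static_cast<int>(std::floor(-0.25f)); // -1: round toward negative infinity
        (void)toward_zero;
        (void)toward_neg;
    }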
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 18c0c97..43f39f4 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -45,7 +45,7 @@
void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
float epsilon)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
_input = input;
_output = output;
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
index 6c9f83f..11f8e33 100644
--- a/src/core/CL/kernels/CLFloorKernel.cpp
+++ b/src/core/CL/kernels/CLFloorKernel.cpp
@@ -44,10 +44,8 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
- set_data_type_if_unknown(*input->info(), DataType::F32);
- set_data_type_if_unknown(*output->info(), DataType::F32);
+ // Auto initialize output
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
index e549dbc..be095f2 100644
--- a/src/core/CL/kernels/CLWarpAffineKernel.cpp
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -88,8 +88,11 @@
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+ int total_right = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
+ const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
+
+ AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
diff --git a/src/core/Logger.cpp b/src/core/Logger.cpp
new file mode 100644
index 0000000..9c3bf26
--- /dev/null
+++ b/src/core/Logger.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Logger.h"
+
+using namespace arm_compute;
+
+Logger::Logger()
+ : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
+{
+}
+
+Logger &Logger::get()
+{
+ static Logger _instance;
+ return _instance;
+}
+
+void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
+{
+ _ostream = &ostream;
+ _verbosity = verbosity;
+}
+
+std::ostream &Logger::log_info()
+{
+ if(_verbosity == LoggerVerbosity::INFO)
+ {
+ return *_ostream;
+ }
+ else
+ {
+ return _nullstream;
+ }
+}
\ No newline at end of file
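Note: log_info() returns either the configured stream or _nullstream; constructing a std::ostream with a nullptr streambuf yields a stream that silently discards everything written to it, so disabled logging costs only the insertion calls. The ARM_COMPUTE_LOG macro used by the graph nodes later in this patch is defined in arm_compute/core/Logger.h, which is not part of this diff; presumably it is along these lines (hypothetical sketch):

    // Hypothetical: forward everything to the singleton's info stream.
    #define ARM_COMPUTE_LOG(x) (arm_compute::Logger::get().log_info() << x)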
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index c8e1113..2766d69 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -203,7 +203,7 @@
vst1q_qs16(p, v);
}
-inline void internal_vst1q(int *p, const qint32x4x2_t &v)
+inline void internal_vst1q(int32_t *p, const qint32x4x2_t &v)
{
vst1q_s32(p, v.val[0]);
vst1q_s32(p + 4, v.val[1]);
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 9505a25..593a529 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -99,7 +99,7 @@
} // namespace arm_compute
NEFillBorderKernel::NEFillBorderKernel()
- : _tensor(nullptr), _border_size(0), _mode(BorderMode::UNDEFINED), _constant_border_value(0)
+ : _tensor(nullptr), _border_size(0), _mode(BorderMode::UNDEFINED), _constant_border_value(static_cast<float>(0.f))
{
}
diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
index 017e259..d1cff6f 100644
--- a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
@@ -42,7 +42,7 @@
} // namespace arm_compute
NEFillInnerBorderKernel::NEFillInnerBorderKernel()
- : _tensor(nullptr), _border_size(0), _constant_border_value(0)
+ : _tensor(nullptr), _border_size(0), _constant_border_value(static_cast<float>(0.f))
{
}
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index dd85ac1..72b652d 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -39,10 +39,8 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
- set_data_type_if_unknown(*input->info(), DataType::F32);
- set_data_type_if_unknown(*output->info(), DataType::F32);
+ // Auto initialize output
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index 6fac797..004ecd0 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -130,7 +130,7 @@
}
}
-std::tuple<int, int, int> NELKTrackerKernel::compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int *bilinear_ix, int *bilinear_iy)
+std::tuple<int, int, int> NELKTrackerKernel::compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int32_t *bilinear_ix, int32_t *bilinear_iy)
{
int iA11 = 0;
int iA12 = 0;
@@ -218,7 +218,8 @@
return std::make_tuple(iA11, iA12, iA22);
}
-std::pair<int, int> NELKTrackerKernel::compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int *bilinear_ix, const int *bilinear_iy)
+std::pair<int, int> NELKTrackerKernel::compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int32_t *bilinear_ix,
+ const int32_t *bilinear_iy)
{
int ib1 = 0;
int ib2 = 0;
@@ -402,8 +403,8 @@
init_keypoints(list_start, list_end);
const int buffer_size = _window_dimension * _window_dimension;
- int bilinear_ix[buffer_size];
- int bilinear_iy[buffer_size];
+ int32_t bilinear_ix[buffer_size];
+ int32_t bilinear_iy[buffer_size];
const int half_window = _window_dimension / 2;
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index fc3f5f2..d6d26e2 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -61,7 +61,7 @@
ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
}
- const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min(norm_info.norm_size() / 2, 3U);
+ const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
_input = input;
_input_squared = input_squared;
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index b97564e..8d4e465 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -108,14 +108,13 @@
std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- static const std::set<int> supported_pool_sizes = { 2, 3, 7 };
+ static const std::set<int> supported_pool_sizes = { 2, 3 };
ARM_COMPUTE_UNUSED(supported_pool_sizes);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type()));
- ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
- ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
+ ARM_COMPUTE_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && (input->info()->data_type() != DataType::F32));
ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2);
@@ -207,7 +206,7 @@
num_elems_read_per_iteration = 8; // We use vload8 for pooling7
break;
default:
- ARM_COMPUTE_ERROR("Pooling size not supported");
+ num_elems_read_per_iteration = 1; // We use vload4 for poolingN but with a leftover for loop
break;
}
num_elems_processed_per_iteration = 1;
@@ -380,7 +379,20 @@
}
break;
default:
- ARM_COMPUTE_ERROR("Unsupported pooling size");
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG>;
+ break;
+ case PoolingType::L2:
+ _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
break;
}
@@ -1005,6 +1017,127 @@
input, output);
}
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int pool_size = _pool_info.pool_size();
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float res = 0.0f;
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ float32x4_t vres = vdupq_n_f32(0.0f);
+
+ for(int y = 0; y < pool_size; ++y)
+ {
+ int x = 0;
+ for(; x <= (pool_size - 4); x += 4)
+ {
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
+ (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+
+                    // Square the input in the L2 pooling case, then accumulate
+ if(pooling_type == PoolingType::L2)
+ {
+ vres = vmlaq_f32(vres, data, data);
+ }
+ else
+ {
+ vres = vaddq_f32(vres, data);
+ }
+ }
+
+ // Leftover for loop
+ for(; x < pool_size; ++x)
+ {
+ float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+
+                    // Square the input in the L2 pooling case
+ if(pooling_type == PoolingType::L2)
+ {
+ data *= data;
+ }
+
+ res += data;
+ }
+ }
+
+#if defined(__aarch64__)
+ // Reduction operation available on 64 bit architectures only
+ res += vaddvq_f32(vres);
+#else // __aarch64__
+ // Reduction
+ float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
+ tmp = vpadd_f32(tmp, tmp);
+
+ res += vget_lane_f32(tmp, 0);
+#endif // __aarch64__
+        // Multiply by scale (the reciprocal of the pooling-window area) to average
+ res *= scale;
+ }
+ else
+ {
+ float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::min());
+ res = std::numeric_limits<float>::min();
+
+ for(int y = 0; y < pool_size; ++y)
+ {
+ int x = 0;
+ for(; x <= (pool_size - 4); x += 4)
+ {
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
+ (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+ vres = vmaxq_f32(vres, data);
+ }
+
+ // Leftover for loop
+ for(; x < pool_size; ++x)
+ {
+ const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+ res = std::max(res, data);
+ }
+ }
+
+#if defined(__aarch64__)
+ // Reduction operation available on 64 bit architectures only
+ res = std::max(vmaxvq_f32(vres), res);
+#else // __aarch64__
+ float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
+ tmp = vpmax_f32(tmp, tmp);
+
+ res = std::max(res, vget_lane_f32(tmp, 0));
+#endif // __aarch64__
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if(pooling_type == PoolingType::L2)
+ {
+ res = std::sqrt(res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float *>(output.ptr())) = res;
+ },
+ input, output);
+}
+
void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
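Note: the new poolingN_f32 path vectorises each window row four floats at a time with vld1q_f32 and finishes with a scalar leftover loop, so any pool size now works for F32. calculate_avg_scale is defined elsewhere in this file and not shown; it is assumed to return the reciprocal of the border-clamped pooling-window area, roughly as below (hypothetical sketch, simplified to stride 1 with clamping on the upper bounds only):

    #include <algorithm>

    // Assumed behaviour: 1 / (visible window area), so `res *= scale`
    // turns the accumulated sum into the window mean.
    float avg_scale_sketch(int start_x, int start_y, int pool_size,
                           int upper_bound_w, int upper_bound_h)
    {
        const int w = std::min(start_x + pool_size, upper_bound_w) - start_x;
        const int h = std::min(start_y + pool_size, upper_bound_h) - start_y;
        return 1.0f / static_cast<float>(w * h);
    }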
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index 62f4e5d..ab8ab14 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -143,7 +143,11 @@
const float start_y0 = M10 * window.x().start();
// Current row
- int y_cur = window.y().start();
+ int y_cur = window.y().start();
+ int z_cur = window.z().start();
+ int d3_cur = window[3].start();
+ int d4_cur = window[4].start();
+ int d5_cur = window[5].start();
// const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
float const_x0 = M01 * y_cur + M02;
@@ -155,10 +159,14 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
- if(y_cur != id.y())
+        // Check if we are processing a new row or slice. If so, update the cached coordinates (y_cur, z_cur, d3_cur, d4_cur, d5_cur) and the constant terms const_x0 and const_y0
+ if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
{
- y_cur = id.y();
+ y_cur = id.y();
+ z_cur = id.z();
+ d3_cur = id[3];
+ d4_cur = id[4];
+ d5_cur = id[5];
const_x0 = M01 * y_cur + M02;
const_y0 = M11 * y_cur + M12;
@@ -222,7 +230,11 @@
const float start_y0 = M10 * window.x().start();
// Current row
- int y_cur = window.y().start();
+ int y_cur = window.y().start();
+ int z_cur = window.z().start();
+ int d3_cur = window[3].start();
+ int d4_cur = window[4].start();
+ int d5_cur = window[5].start();
// const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
float const_x0 = M01 * y_cur + M02;
@@ -234,10 +246,14 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
- if(y_cur != id.y())
+        // Check if we are processing a new row or slice. If so, update the cached coordinates (y_cur, z_cur, d3_cur, d4_cur, d5_cur) and the constant terms const_x0 and const_y0
+ if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
{
- y_cur = id.y();
+ y_cur = id.y();
+ z_cur = id.z();
+ d3_cur = id[3];
+ d4_cur = id[4];
+ d5_cur = id[5];
const_x0 = M01 * y_cur + M02;
const_y0 = M11 * y_cur + M12;
@@ -264,7 +280,34 @@
}
else
{
- *out.ptr() = _constant_border_value;
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = _constant_border_value;
+ break;
+ case InterpolationPolicy::BILINEAR:
+ {
+ const auto xi = clamp<int>(std::floor(x0), min_x - 1, max_x);
+ const auto yi = clamp<int>(std::floor(y0), min_y - 1, max_y);
+ const auto xi_1 = clamp<int>(std::floor(x0 + 1), min_x - 1, max_x);
+ const auto yi_1 = clamp<int>(std::floor(y0 + 1), min_y - 1, max_y);
+
+ const float dx = x0 - std::floor(x0);
+ const float dy = y0 - std::floor(y0);
+ const float dx1 = 1.0f - dx;
+ const float dy1 = 1.0f - dy;
+
+ const float a00 = *(in.ptr() + xi + yi * stride);
+ const float a01 = *(in.ptr() + xi_1 + yi * stride);
+ const float a10 = *(in.ptr() + xi + yi_1 * stride);
+ const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
+
+ *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
}
x0 += M00;
@@ -292,7 +335,11 @@
const size_t stride = _input->info()->strides_in_bytes()[1];
// Current row
- int y_cur = window.y().start();
+ int y_cur = window.y().start();
+ int z_cur = window.z().start();
+ int d3_cur = window[3].start();
+ int d4_cur = window[4].start();
+ int d5_cur = window[5].start();
const float M00 = _matrix[0];
const float M10 = _matrix[1];
@@ -314,10 +361,14 @@
execute_window_loop(window, [&](const Coordinates & id)
{
- // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
- if(y_cur != id.y())
+        // Check if we are processing a new row or slice. If so, update the cached coordinates (y_cur, z_cur, d3_cur, d4_cur, d5_cur) and the constant terms const_x0 and const_y0
+ if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
{
- y_cur = id.y();
+ y_cur = id.y();
+ z_cur = id.z();
+ d3_cur = id[3];
+ d4_cur = id[4];
+ d5_cur = id[5];
const_x0 = M01 * y_cur + M02;
const_y0 = M11 * y_cur + M12;
@@ -345,10 +396,34 @@
else
{
// Clamp coordinates
- const auto xi = clamp<int>(x0, min_x, max_x - 1);
- const auto yi = clamp<int>(y0, min_y, max_y - 1);
+ const auto xi = clamp<int>(std::floor(x0), min_x, max_x - 1);
+ const auto yi = clamp<int>(std::floor(y0), min_y, max_y - 1);
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = *(in.ptr() + xi + yi * stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ {
+ const auto xi_1 = clamp<int>(std::floor(x0 + 1), min_x, max_x - 1);
+ const auto yi_1 = clamp<int>(std::floor(y0 + 1), min_y, max_y - 1);
- *out.ptr() = *(in.ptr() + xi + yi * stride);
+ const float dx = x0 - std::floor(x0);
+ const float dy = y0 - std::floor(y0);
+ const float dx1 = 1.0f - dx;
+ const float dy1 = 1.0f - dy;
+
+ const float a00 = *(in.ptr() + xi + yi * stride);
+ const float a01 = *(in.ptr() + xi_1 + yi * stride);
+ const float a10 = *(in.ptr() + xi + yi_1 * stride);
+ const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
+
+ *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
}
x0 += M00;
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 525506f..7dddb1c 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -46,28 +46,22 @@
*
* @param _next_hint Device execution hint
*/
- void configure(Hint _next_hint);
+ void configure(GraphHints _next_hints);
- /** Sets whether to enable information print out
- *
- * @param[in] is_enabled Set to true if need info printed out
- */
- void set_info_enablement(bool is_enabled);
-
+ GraphContext _ctx{};
std::vector<Stage> _pipeline{};
std::vector<std::unique_ptr<Tensor>> _tensors{};
std::vector<std::unique_ptr<INode>> _nodes{};
- Hint _current_hint{ Hint::DONT_CARE };
- Hint _next_hint{ Hint::DONT_CARE };
+ GraphHints _current_hints{};
+ GraphHints _next_hints{};
std::unique_ptr<Tensor> _graph_input{ nullptr };
std::unique_ptr<Tensor> _graph_output{ nullptr };
std::unique_ptr<INode> _current_node{ nullptr };
Tensor *_current_output{ nullptr };
- bool _info_enabled{ false };
private:
- Tensor *_current_input{ nullptr };
- Hint _previous_hint{ Hint::DONT_CARE };
+ Tensor *_current_input{ nullptr };
+ GraphHints _previous_hints{};
};
Graph::~Graph() //NOLINT
@@ -102,7 +96,7 @@
}
//Finalize current node's configuration
-void Graph::Private::configure(Hint _next_hint)
+void Graph::Private::configure(GraphHints _next_hints)
{
ARM_COMPUTE_ERROR_ON(_current_node == nullptr);
ARM_COMPUTE_ERROR_ON(_graph_input == nullptr);
@@ -110,9 +104,9 @@
// Is it the first node of the graph ?
if(_current_input == nullptr)
{
- _graph_input->set_target(_current_hint);
- _current_input = _graph_input.get();
- _previous_hint = _current_hint; // For the first node just assume the previous node was of the same type as this one
+ _graph_input->set_target(_current_hints.target_hint());
+ _current_input = _graph_input.get();
+ _previous_hints = _current_hints; // For the first node just assume the previous node was of the same type as this one
}
//Automatic output configuration ?
@@ -123,29 +117,31 @@
}
// If either the writer or reader node needs OpenCL then use OpenCL memory:
- if((_next_hint == Hint::OPENCL || _current_hint == Hint::OPENCL))
+ if((_next_hints.target_hint() == TargetHint::OPENCL || _current_hints.target_hint() == TargetHint::OPENCL))
{
- _current_output->set_target(Hint::OPENCL);
+ _current_output->set_target(TargetHint::OPENCL);
}
else
{
- _current_output->set_target(Hint::NEON);
+ _current_output->set_target(TargetHint::NEON);
}
- // Map input if needed
- std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_current_hint, _current_input->tensor(), _current_output->tensor());
+ // Update ctx and instantiate node
+ _ctx.hints() = _current_hints;
+ std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_ctx, _current_input->tensor(), _current_output->tensor());
_current_input->allocate();
- if(_current_input->target() == Hint::OPENCL)
+ // Map input if needed
+ if(_current_input->target() == TargetHint::OPENCL)
{
- if(_previous_hint == Hint::NEON)
+ if(_previous_hints.target_hint() == TargetHint::NEON)
{
- ARM_COMPUTE_ERROR_ON(_current_hint == Hint::NEON);
+ ARM_COMPUTE_ERROR_ON(_current_hints.target_hint() == TargetHint::NEON);
_pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLUnmap>(_current_input) });
}
- if(_current_hint == Hint::NEON)
+ if(_current_hints.target_hint() == TargetHint::NEON)
{
- ARM_COMPUTE_ERROR_ON(_previous_hint == Hint::NEON);
+ ARM_COMPUTE_ERROR_ON(_previous_hints.target_hint() == TargetHint::NEON);
_pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLMap>(_current_input, true) });
}
}
@@ -154,13 +150,8 @@
_current_input = _current_output;
_current_output = nullptr;
- _previous_hint = _current_hint;
- _current_hint = _next_hint;
-}
-
-void Graph::Private::set_info_enablement(bool is_enabled)
-{
- _info_enabled = is_enabled;
+ std::swap(_previous_hints, _current_hints);
+ std::swap(_current_hints, _next_hints);
}
void Graph::add_node(std::unique_ptr<INode> node)
@@ -169,22 +160,18 @@
ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_graph_output != nullptr, "Nothing can be added after the output tensor");
//Trigger the creation of the current Node:
- Hint _next_hint = node->override_hint(_pimpl->_next_hint);
- ARM_COMPUTE_ERROR_ON(_next_hint == Hint::DONT_CARE);
+ GraphHints _next_hints = _pimpl->_next_hints;
+ _next_hints.set_target_hint(node->override_target_hint(_pimpl->_next_hints.target_hint()));
+ ARM_COMPUTE_ERROR_ON(_next_hints.target_hint() == TargetHint::DONT_CARE);
if(_pimpl->_current_node)
{
//Finalize the previous Node:
- _pimpl->configure(_pimpl->_next_hint);
-
- if(_pimpl->_info_enabled)
- {
- _pimpl->_current_node->print_info();
- }
+ _pimpl->configure(_pimpl->_next_hints);
}
else
{
- // If that's the first node then use the same Hint before and after the node.
- _pimpl->_current_hint = _next_hint;
+ // If that's the first node then use the same TargetHint before and after the node.
+ _pimpl->_current_hints = _next_hints;
}
if(_pimpl->_current_node)
{
@@ -192,15 +179,6 @@
}
_pimpl->_current_node = std::move(node);
}
-void Graph::set_hint(Hint hint)
-{
- _pimpl->_next_hint = hint;
-}
-
-void Graph::set_info_enablement(bool is_enabled)
-{
- _pimpl->set_info_enablement(is_enabled);
-}
//Add a tensor with an Accessor (i.e either the input or output of the graph)
void Graph::add_tensor(std::unique_ptr<Tensor> tensor)
@@ -221,7 +199,7 @@
_pimpl->_current_output = _pimpl->_graph_output.get();
// Finalize the graph by configuring the last Node of the graph:
- _pimpl->configure(_pimpl->_current_hint); // Ignore _next_hint as this is the last node, and just use the same hint as before this node.
+ _pimpl->configure(_pimpl->_current_hints); // Ignore _next_hint as this is the last node, and just use the same hint as before this node.
_pimpl->_graph_output->allocate();
}
}
@@ -236,6 +214,11 @@
_pimpl->_current_output = _pimpl->_tensors.back().get();
}
+GraphHints &Graph::hints()
+{
+ return _pimpl->_next_hints;
+}
+
Graph &arm_compute::graph::operator<<(Graph &graph, TensorInfo &&info)
{
graph.set_temp(std::move(info));
@@ -248,8 +231,14 @@
return graph;
}
-Graph &arm_compute::graph::operator<<(Graph &graph, Hint hint)
+Graph &arm_compute::graph::operator<<(Graph &graph, TargetHint target_hint)
{
- graph.set_hint(hint);
+ graph.hints().set_target_hint(target_hint);
+ return graph;
+}
+
+Graph &arm_compute::graph::operator<<(Graph &graph, ConvolutionMethodHint conv_method_hint)
+{
+ graph.hints().set_convolution_method_hint(conv_method_hint);
return graph;
}
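Note: with set_hint() and set_info_enablement() gone, hints are now streamed into the graph like any other element and take effect for the nodes added after them. An illustrative usage (tensor and layer setup elided):

    using namespace arm_compute::graph;

    Graph graph;
    // TargetHint selects the backend; ConvolutionMethodHint chooses between
    // GEMM-based and direct convolution for subsequent convolution nodes.
    graph << TargetHint::OPENCL
          << ConvolutionMethodHint::DIRECT;
    //    << ... input tensor, layers and output tensor as before ...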
diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp
new file mode 100644
index 0000000..bfc6fcd
--- /dev/null
+++ b/src/graph/GraphContext.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/GraphContext.h"
+
+using namespace arm_compute::graph;
+
+GraphHints::GraphHints(TargetHint target_hint, ConvolutionMethodHint conv_method_hint)
+ : _target_hint(target_hint), _convolution_method_hint(conv_method_hint)
+{
+}
+
+void GraphHints::set_target_hint(TargetHint target_hint)
+{
+ _target_hint = target_hint;
+}
+
+void GraphHints::set_convolution_method_hint(ConvolutionMethodHint convolution_method)
+{
+ _convolution_method_hint = convolution_method;
+}
+
+TargetHint GraphHints::target_hint() const
+{
+ return _target_hint;
+}
+
+ConvolutionMethodHint GraphHints::convolution_method_hint() const
+{
+ return _convolution_method_hint;
+}
+
+GraphContext::GraphContext()
+ : _hints()
+{
+}
+
+GraphHints &GraphContext::hints()
+{
+ return _hints;
+}
+
+const GraphHints &GraphContext::hints() const
+{
+ return _hints;
+}
\ No newline at end of file
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index 6b25022..4b383f5 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "arm_compute/graph/INode.h"
#include "arm_compute/core/CL/OpenCL.h"
@@ -31,17 +30,20 @@
using namespace arm_compute::graph;
-Hint INode::override_hint(Hint hint) const
+TargetHint INode::override_target_hint(TargetHint target_hint) const
{
- if(hint == Hint::OPENCL && !opencl_is_available())
+ if(target_hint == TargetHint::OPENCL && !opencl_is_available())
{
- hint = Hint::DONT_CARE;
+ target_hint = TargetHint::DONT_CARE;
}
- hint = node_override_hint(hint);
- ARM_COMPUTE_ERROR_ON(hint == Hint::OPENCL && !opencl_is_available());
- return hint;
+ GraphHints hints{ target_hint };
+ target_hint = node_override_hints(hints).target_hint();
+ ARM_COMPUTE_ERROR_ON(target_hint == TargetHint::OPENCL && !opencl_is_available());
+ return target_hint;
}
-Hint INode::node_override_hint(Hint hint) const
+GraphHints INode::node_override_hints(GraphHints hints) const
{
- return hint == Hint::DONT_CARE ? Hint::NEON : hint;
+ TargetHint target_hint = hints.target_hint();
+ hints.set_target_hint((target_hint == TargetHint::DONT_CARE) ? TargetHint::NEON : target_hint);
+ return hints;
}
diff --git a/src/graph/SubTensor.cpp b/src/graph/SubTensor.cpp
new file mode 100644
index 0000000..abf8506
--- /dev/null
+++ b/src/graph/SubTensor.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/SubTensor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/SubTensor.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename SubTensorType, typename ParentTensorType>
+std::unique_ptr<ITensor> initialise_subtensor(ITensor *parent, TensorShape shape, Coordinates coords)
+{
+ auto ptensor = dynamic_cast<ParentTensorType *>(parent);
+ auto subtensor = arm_compute::support::cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
+ return std::move(subtensor);
+}
+} // namespace
+
+SubTensor::SubTensor()
+ : _target(TargetHint::DONT_CARE), _coords(), _info(), _parent(nullptr), _subtensor(nullptr)
+{
+}
+
+SubTensor::SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords)
+ : _target(TargetHint::DONT_CARE), _coords(coords), _info(), _parent(nullptr), _subtensor(nullptr)
+{
+ ARM_COMPUTE_ERROR_ON(parent.tensor() == nullptr);
+ _parent = parent.tensor();
+ _info = SubTensorInfo(parent.tensor()->info(), tensor_shape, coords);
+ _target = parent.target();
+
+ instantiate_subtensor();
+}
+
+SubTensor::SubTensor(ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target)
+ : _target(target), _coords(coords), _info(), _parent(parent), _subtensor(nullptr)
+{
+ ARM_COMPUTE_ERROR_ON(parent == nullptr);
+ _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+
+ instantiate_subtensor();
+}
+
+void SubTensor::set_info(SubTensorInfo &&info)
+{
+ _info = info;
+}
+
+const SubTensorInfo &SubTensor::info() const
+{
+ return _info;
+}
+
+ITensor *SubTensor::tensor()
+{
+ return _subtensor.get();
+}
+
+TargetHint SubTensor::target() const
+{
+ return _target;
+}
+
+void SubTensor::instantiate_subtensor()
+{
+ switch(_target)
+ {
+ case TargetHint::OPENCL:
+ _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _info.tensor_shape(), _coords);
+ break;
+ case TargetHint::NEON:
+ _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _info.tensor_shape(), _coords);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid TargetHint");
+ }
+}
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
index c534ae0..31dd4e8 100644
--- a/src/graph/Tensor.cpp
+++ b/src/graph/Tensor.cpp
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/core/Error.h"
@@ -53,7 +52,7 @@
} // namespace
Tensor::Tensor(TensorInfo &&info)
- : _target(Hint::DONT_CARE), _info(info), _accessor(nullptr), _tensor(nullptr)
+ : _target(TargetHint::DONT_CARE), _info(info), _accessor(nullptr), _tensor(nullptr)
{
}
@@ -96,7 +95,7 @@
return _info;
}
-ITensor *Tensor::set_target(Hint target)
+ITensor *Tensor::set_target(TargetHint target)
{
if(_tensor != nullptr)
{
@@ -106,14 +105,14 @@
{
switch(target)
{
- case Hint::OPENCL:
+ case TargetHint::OPENCL:
_tensor = initialise_tensor<arm_compute::CLTensor>(_info);
break;
- case Hint::NEON:
+ case TargetHint::NEON:
_tensor = initialise_tensor<arm_compute::Tensor>(_info);
break;
default:
- ARM_COMPUTE_ERROR("Invalid Hint");
+ ARM_COMPUTE_ERROR("Invalid TargetHint");
}
_target = target;
}
@@ -125,14 +124,14 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor.get());
switch(_target)
{
- case Hint::OPENCL:
+ case TargetHint::OPENCL:
tensor_allocate<arm_compute::CLTensor>(*_tensor);
break;
- case Hint::NEON:
+ case TargetHint::NEON:
tensor_allocate<arm_compute::Tensor>(*_tensor);
break;
default:
- ARM_COMPUTE_ERROR("Invalid Hint");
+ ARM_COMPUTE_ERROR("Invalid TargetHint");
}
}
@@ -145,7 +144,7 @@
}
}
-Hint Tensor::target() const
+TargetHint Tensor::target() const
{
return _target;
}
diff --git a/src/graph/nodes/ActivationLayer.cpp b/src/graph/nodes/ActivationLayer.cpp
index b71e22c..5cd2a0b 100644
--- a/src/graph/nodes/ActivationLayer.cpp
+++ b/src/graph/nodes/ActivationLayer.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/graph/nodes/ActivationLayer.h"
+#include "arm_compute/core/Logger.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
@@ -34,7 +35,7 @@
namespace
{
-template <typename ActivationType, typename TensorType, Hint hint>
+template <typename ActivationType, typename TensorType, TargetHint target_hint>
std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
{
auto activation = arm_compute::support::cpp14::make_unique<ActivationType>();
@@ -46,19 +47,19 @@
return std::move(activation);
}
-template <Hint hint>
+template <TargetHint target_hint>
std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info);
template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
{
- return instantiate_function<arm_compute::CLActivationLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output, activation_info);
+ return instantiate_function<arm_compute::CLActivationLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output, activation_info);
}
template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
{
- return instantiate_function<arm_compute::NEActivationLayer, arm_compute::Tensor, Hint::NEON>(input, output, activation_info);
+ return instantiate_function<arm_compute::NEActivationLayer, arm_compute::Tensor, TargetHint::NEON>(input, output, activation_info);
}
} // namespace
@@ -67,40 +68,28 @@
{
}
-std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
{
std::unique_ptr<arm_compute::IFunction> func;
- _hint = hint;
- _input = input;
- _output = output;
+ _target_hint = ctx.hints().target_hint();
- if(_hint == Hint::OPENCL)
+ if(_target_hint == TargetHint::OPENCL)
{
- func = instantiate<Hint::OPENCL>(input, output, _activation_info);
+ func = instantiate<TargetHint::OPENCL>(input, output, _activation_info);
+ ARM_COMPUTE_LOG("Instantiating CLActivationLayer");
}
else
{
- func = instantiate<Hint::NEON>(input, output, _activation_info);
+ func = instantiate<TargetHint::NEON>(input, output, _activation_info);
+ ARM_COMPUTE_LOG("Instantiating NEActivationLayer");
}
+
+ ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Activation function: " << _activation_info.activation()
+ << " a: " << _activation_info.a()
+ << " b: " << _activation_info.b()
+ << std::endl);
return func;
}
-
-void ActivationLayer::print_info()
-{
- if(_hint == Hint::OPENCL)
- {
- std::cout << "Instantiating CLActivationLayer";
- }
- else
- {
- std::cout << "Instantiating NEActivationLayer";
- }
-
- std::cout << " Data Type: " << _input->info()->data_type()
- << " Input shape: " << _input->info()->tensor_shape()
- << " Output shape: " << _output->info()->tensor_shape()
- << " Activation function: " << _activation_info.activation()
- << " a: " << _activation_info.a()
- << " b: " << _activation_info.b()
- << std::endl;
-}
diff --git a/src/graph/nodes/BatchNormalizationLayer.cpp b/src/graph/nodes/BatchNormalizationLayer.cpp
new file mode 100644
index 0000000..a6a990f
--- /dev/null
+++ b/src/graph/nodes/BatchNormalizationLayer.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/BatchNormalizationLayer.h"
+
+#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename BatchBatchNormalizationLayer, typename TensorType, TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon)
+{
+    auto norm = arm_compute::support::cpp14::make_unique<BatchNormalizationType>();
+ norm->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ dynamic_cast<TensorType *>(mean.set_target(target_hint)),
+ dynamic_cast<TensorType *>(var.set_target(target_hint)),
+ dynamic_cast<TensorType *>(beta.set_target(target_hint)),
+ dynamic_cast<TensorType *>(gamma.set_target(target_hint)),
+ epsilon);
+
+ return std::move(norm);
+}
+
+template <TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon)
+{
+ return instantiate_function<arm_compute::CLBatchNormalizationLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, output, mean, var, beta, gamma, epsilon);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon)
+{
+ return instantiate_function<arm_compute::NEBatchNormalizationLayer, arm_compute::ITensor, TargetHint::NEON>(input, output, mean, var, beta, gamma, epsilon);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> BatchNormalizationLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+{
+ std::unique_ptr<arm_compute::IFunction> func;
+ _target_hint = ctx.hints().target_hint();
+
+ unsigned int batch_norm_size = input->info()->dimension(2);
+ if(_mean.tensor() == nullptr)
+ {
+ _mean.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+ if(_var.tensor() == nullptr)
+ {
+ _var.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+ if(_beta.tensor() == nullptr)
+ {
+ _beta.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+ if(_gamma.tensor() == nullptr)
+ {
+ _gamma.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+
+ if(_target_hint == TargetHint::OPENCL)
+ {
+ func = instantiate<TargetHint::OPENCL>(input, output, _mean, _var, _beta, _gamma, _epsilon);
+ ARM_COMPUTE_LOG("Instantiating CLBatchNormalizationLayer");
+ }
+ else
+ {
+ func = instantiate<TargetHint::NEON>(input, output, _mean, _var, _beta, _gamma, _epsilon);
+ ARM_COMPUTE_LOG("Instantiating NEBatchNormalizationLayer");
+ }
+
+ ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return func;
+}
\ No newline at end of file
diff --git a/src/graph/nodes/ConvolutionLayer.cpp b/src/graph/nodes/ConvolutionLayer.cpp
index b80bf93..b47be8d 100644
--- a/src/graph/nodes/ConvolutionLayer.cpp
+++ b/src/graph/nodes/ConvolutionLayer.cpp
@@ -23,61 +23,159 @@
*/
#include "arm_compute/graph/nodes/ConvolutionLayer.h"
+#include "arm_compute/core/Logger.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "support/ToolchainSupport.h"
+#include "utils/GraphTypePrinter.h"
#include "utils/TypePrinter.h"
+#include <tuple>
+#include <vector>
+
using namespace arm_compute::graph;
namespace
{
-template <typename ConvolutionType, typename TensorType, Hint hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+/** Calculates the output shape of the convolution layer
+ *
+ * @param[in] input_shape Input tensor shape
+ * @param[in] weights_shape Weights shape
+ * @param[in] conv_info Convolution information (padding, stride, etc.)
+ *
+ * @return The expected output tensor shape
+ */
+TensorShape calculate_convolution_layer_output_shape(const TensorShape &input_shape, const TensorShape &weights_shape, const PadStrideInfo &conv_info)
{
- bool weights_are_loaded = weights.tensor() != nullptr;
- bool biases_are_loaded = biases.tensor() != nullptr;
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+ // Get output width and height
+ std::tie(output_width, output_height) = arm_compute::scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
+
+ // Create output shape
+ TensorShape output_shape = input_shape;
+ output_shape.set(0, output_width);
+ output_shape.set(1, output_height);
+ output_shape.set(2, weights_shape[3]);
+
+ return output_shape;
+}
+
+// Instantiate GEMM based convolution layer
+template <typename ConvolutionType, typename TensorType, TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
conv->configure(
dynamic_cast<TensorType *>(input),
- dynamic_cast<TensorType *>(weights.set_target(hint)),
- dynamic_cast<TensorType *>(biases.set_target(hint)),
+ dynamic_cast<TensorType *>(weights),
+ dynamic_cast<TensorType *>(biases),
dynamic_cast<TensorType *>(output),
conv_info, weights_info);
- if(!weights_are_loaded)
- {
- weights.allocate_and_fill_if_needed();
- }
- if(!biases_are_loaded)
- {
- biases.allocate_and_fill_if_needed();
- }
-
return std::move(conv);
}
-template <Hint hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info);
+// Instantiate direct convolution layer
+template <typename ConvolutionType, typename TensorType, TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_direct_function(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+ auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
+ conv->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(weights),
+ dynamic_cast<TensorType *>(biases),
+ dynamic_cast<TensorType *>(output),
+ conv_info);
+ return std::move(conv);
+}
+
+template <TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ ConvolutionMethodHint conv_method);
template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ ConvolutionMethodHint conv_method)
{
- return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::CLTensor, Hint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
+ if(conv_method == ConvolutionMethodHint::GEMM)
+ {
+ return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
+ }
+ else
+ {
+ return instantiate_direct_function<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info);
+ }
}
template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ ConvolutionMethodHint conv_method)
{
- return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::Tensor, Hint::NEON>(input, weights, biases, output, conv_info, weights_info);
+ if(conv_method == ConvolutionMethodHint::GEMM)
+ {
+ return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info, weights_info);
+ }
+ else
+ {
+ return instantiate_direct_function<arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info);
+ }
}
} // namespace
-std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+/** Grouped Convolution function */
+class GroupedConvolutionFunction final : public arm_compute::IFunction
{
+public:
+ /** Default Constructor */
+ GroupedConvolutionFunction()
+ : _convolutions()
+ {
+ }
+ /** Default Destructor */
+ ~GroupedConvolutionFunction() final = default;
+ /** Prevent instances from being copy constructed */
+ GroupedConvolutionFunction(const GroupedConvolutionFunction &) = delete;
+ /** Prevent instances from being copy assigned */
+ GroupedConvolutionFunction &operator=(const GroupedConvolutionFunction &) = delete;
+ /** Allow instances to be move constructed */
+ GroupedConvolutionFunction(GroupedConvolutionFunction &&) noexcept = default;
+ /** Allow instances to be move assigned */
+ GroupedConvolutionFunction &operator=(GroupedConvolutionFunction &&) noexcept = default;
+ /** Adds a convolution
+ *
+ * @param[in] convolution Convolution function to add
+ */
+ void add_convolution_function(std::unique_ptr<IFunction> convolution)
+ {
+ _convolutions.emplace_back(std::move(convolution));
+ }
+
+ // Inherited methods overridden:
+ void run() override
+ {
+ for(auto &c : _convolutions)
+ {
+ c->run();
+ }
+ }
+
+private:
+ std::vector<std::unique_ptr<IFunction>> _convolutions;
+};
+
+std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+{
+ // Set weights and biases info
if(_weights.tensor() == nullptr)
{
- _weights.set_info(TensorInfo(TensorShape(_conv_width, _conv_height, input->info()->dimension(2), _ofm), input->info()->num_channels(), input->info()->data_type(),
+ _weights.set_info(TensorInfo(TensorShape(_conv_width, _conv_height, input->info()->dimension(2) / _num_groups, _ofm),
+ input->info()->num_channels(), input->info()->data_type(),
input->info()->fixed_point_position()));
}
if(_biases.tensor() == nullptr)
@@ -86,32 +184,139 @@
}
std::unique_ptr<arm_compute::IFunction> func;
- _hint = hint;
- _input = input;
- _output = output;
+ _target_hint = ctx.hints().target_hint();
+ const ConvolutionMethodHint conv_method_hint = ctx.hints().convolution_method_hint();
- if(_hint == Hint::OPENCL)
+ // Check if the weights and biases are loaded
+ bool weights_are_loaded = _weights.tensor() != nullptr;
+ bool biases_are_loaded = _biases.tensor() != nullptr;
+
+ // Set bias and weights target
+ _weights.set_target(_target_hint);
+ _biases.set_target(_target_hint);
+
+ // Calculate output shape
+ TensorShape output_shape = calculate_convolution_layer_output_shape(input->info()->tensor_shape(), _weights.info().tensor_shape(), _conv_info);
+
+ // Output auto-initialization if not yet initialized
+ arm_compute::auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ // Create appropriate convolution function
+ if(_num_groups == 1)
{
- func = instantiate<Hint::OPENCL>(input, _weights, _biases, output, _conv_info, _weights_info);
+ func = instantiate_convolution(input, output, conv_method_hint);
+ ARM_COMPUTE_LOG("Instantiating CLConvolutionLayer");
}
else
{
- func = instantiate<Hint::NEON>(input, _weights, _biases, output, _conv_info, _weights_info);
+ func = instantiate_grouped_convolution(input, output, conv_method_hint);
+ ARM_COMPUTE_LOG("Instantiating NEConvolutionLayer");
}
+ // Fill weights
+ if(!weights_are_loaded)
+ {
+ _weights.allocate_and_fill_if_needed();
+ }
+ // Fill biases
+ if(!biases_are_loaded)
+ {
+ _biases.allocate_and_fill_if_needed();
+ }
+
+ ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+ << " Input Shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << _weights.info().tensor_shape()
+ << " Biases Shape: " << _biases.info().tensor_shape()
+ << " Output Shape: " << output->info()->tensor_shape()
+ << " PadStrideInfo: " << _conv_info
+ << " Groups: " << _num_groups
+ << " WeightsInfo: " << _weights_info
+ << std::endl);
+
return func;
}
-void ConvolutionLayer::print_info()
+std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_convolution(ITensor *input, ITensor *output, ConvolutionMethodHint conv_method_hint)
{
- if(_hint == Hint::OPENCL)
+ std::unique_ptr<arm_compute::IFunction> func;
+ if(_target_hint == TargetHint::OPENCL)
{
- std::cout << "Instantiating CLConvolutionLayer";
+ func = instantiate<TargetHint::OPENCL>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
}
else
{
- std::cout << "Instantiating NEConvolutionLayer";
+ func = instantiate<TargetHint::NEON>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
}
- std::cout << " Type: " << _input->info()->data_type() << " Input Shape: " << _input->info()->tensor_shape() << " Weights shape: " << _weights.info().tensor_shape() << " Biases Shape: " <<
- _biases.info().tensor_shape() << " Output Shape: " << _output->info()->tensor_shape() << " PadStrideInfo: " << _conv_info << "WeightsInfo: " << _weights_info << std::endl;
+ return func;
+}
+
+std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_grouped_convolution(ITensor *input, ITensor *output, ConvolutionMethodHint conv_method_hint)
+{
+ // Get tensor shapes
+ TensorShape input_shape = input->info()->tensor_shape();
+ TensorShape output_shape = output->info()->tensor_shape();
+ TensorShape weights_shape = _weights.info().tensor_shape();
+ TensorShape biases_shape = _biases.info().tensor_shape();
+
+ ARM_COMPUTE_ERROR_ON_MSG((input_shape.z() % _num_groups) != 0, "Input depth not a multiple of the number of groups!");
+ ARM_COMPUTE_ERROR_ON_MSG((output_shape.z() % _num_groups) != 0, "Output depth not a multiple of the number of groups!");
+ ARM_COMPUTE_ERROR_ON_MSG((weights_shape[3] % _num_groups) != 0, "Number of kernels not a multiple of the number of groups!");
+ ARM_COMPUTE_ERROR_ON_MSG((biases_shape.x() % _num_groups) != 0, "Biases size not a multiple of the number of groups!");
+
+ // Create a grouped convolution function
+ auto grouped_conv = arm_compute::support::cpp14::make_unique<GroupedConvolutionFunction>();
+
+ // Create sub-tensors vectors
+ _is = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
+ _os = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
+ _ws = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
+ _bs = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
+
+ // Calculate sub-tensor splits
+ const int input_split = input_shape.z() / _num_groups;
+ const int output_split = output_shape.z() / _num_groups;
+ const int weights_split = weights_shape[3] / _num_groups;
+ const int biases_split = biases_shape.x() / _num_groups;
+
+ // Calculate sub-tensor shapes
+ input_shape.set(2, input_split);
+ output_shape.set(2, output_split);
+ weights_shape.set(3, weights_split);
+ biases_shape.set(0, biases_split);
+
+ // Configure sub-tensors
+ for(int i = 0; i < static_cast<int>(_num_groups); ++i)
+ {
+ // Create convolution function
+ std::unique_ptr<arm_compute::IFunction> func;
+
+ // Calculate sub-tensors starting coordinates
+ Coordinates input_coord(0, 0, input_split * i);
+ Coordinates output_coord(0, 0, output_split * i);
+ Coordinates weights_coord(0, 0, 0, weights_split * i);
+ Coordinates biases_coord(biases_split * i);
+
+ // Create sub-tensors for input, output, weights and bias
+ auto hint_to_use = (_target_hint == TargetHint::OPENCL) ? TargetHint::OPENCL : TargetHint::NEON;
+ _is[i] = SubTensor(input, input_shape, input_coord, hint_to_use);
+ _os[i] = SubTensor(output, output_shape, output_coord, hint_to_use);
+ _ws[i] = SubTensor(_weights.tensor(), weights_shape, weights_coord, hint_to_use);
+ _bs[i] = SubTensor(_biases.tensor(), biases_shape, biases_coord, hint_to_use);
+
+ // Instantiate convolution function
+ if(_target_hint == TargetHint::OPENCL)
+ {
+ func = instantiate<TargetHint::OPENCL>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
+ }
+ else
+ {
+ func = instantiate<TargetHint::NEON>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
+ }
+
+ // Add convolution function to the list of convolutions for the grouped convolution
+ grouped_conv->add_convolution_function(std::move(func));
+ }
+
+ return std::move(grouped_conv);
}
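
Note on calculate_convolution_layer_output_shape above: arm_compute::scaled_dimensions applies the usual convolution arithmetic. A standalone sketch, assuming the default FLOOR rounding and symmetric padding taken from PadStrideInfo:

#include <utility>

// out = (in + 2 * pad - kernel) / stride + 1, with integer (floor) division.
std::pair<unsigned int, unsigned int> conv_output_dims(unsigned int in_w, unsigned int in_h,
                                                       unsigned int kernel_w, unsigned int kernel_h,
                                                       unsigned int stride_x, unsigned int stride_y,
                                                       unsigned int pad_x, unsigned int pad_y)
{
    const unsigned int out_w = (in_w + 2 * pad_x - kernel_w) / stride_x + 1;
    const unsigned int out_h = (in_h + 2 * pad_y - kernel_h) / stride_y + 1;
    return std::make_pair(out_w, out_h);
}

// Example: a 224x224 input with a 7x7 kernel, stride 2 and padding 3
// yields a 112x112 output: (224 + 6 - 7) / 2 + 1 = 112.
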
diff --git a/src/graph/nodes/FloorLayer.cpp b/src/graph/nodes/FloorLayer.cpp
new file mode 100644
index 0000000..722cfdf
--- /dev/null
+++ b/src/graph/nodes/FloorLayer.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/FloorLayer.h"
+
+#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLFloor.h"
+#include "arm_compute/runtime/NEON/functions/NEFloor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename FloorType, typename TensorType, TargetHint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output)
+{
+ auto floorlayer = arm_compute::support::cpp14::make_unique<FloorType>();
+ floorlayer->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output));
+
+ return std::move(floorlayer);
+}
+
+template <TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output)
+{
+ return instantiate_function<arm_compute::CLFloor, arm_compute::ICLTensor, TargetHint::OPENCL>(input, output);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output)
+{
+ return instantiate_function<arm_compute::NEFloor, arm_compute::ITensor, TargetHint::NEON>(input, output);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> FloorLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+{
+ std::unique_ptr<arm_compute::IFunction> func;
+ _target_hint = ctx.hints().target_hint();
+
+ if(_target_hint == TargetHint::OPENCL)
+ {
+ func = instantiate<TargetHint::OPENCL>(input, output);
+ ARM_COMPUTE_LOG("Instantiating CLFloorLayer");
+ }
+ else
+ {
+ func = instantiate<TargetHint::NEON>(input, output);
+ ARM_COMPUTE_LOG("Instantiating NEFloorLayer");
+ }
+
+ ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return func;
+}
diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
index 8d244cb..6b21810 100644
--- a/src/graph/nodes/FullyConnectedLayer.cpp
+++ b/src/graph/nodes/FullyConnectedLayer.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/graph/nodes/FullyConnectedLayer.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Logger.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "support/ToolchainSupport.h"
@@ -33,7 +34,17 @@
namespace
{
-template <typename FullyConnectedType, typename TensorType, Hint hint>
+TensorShape calculate_fullyconnected_layer_output_shape(const TensorShape &input_shape, unsigned int output_neurons)
+{
+ // Note: Only 1D batch space is supported at the moment
+ unsigned int batches = input_shape[1];
+ if(input_shape.num_dimensions() > 2)
+ {
+ batches = input_shape[3];
+ }
+ return TensorShape(output_neurons, batches);
+}
+template <typename FullyConnectedType, typename TensorType, TargetHint target_hint>
std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
{
bool weights_are_loaded = weights.tensor() != nullptr;
@@ -42,8 +53,8 @@
auto conv = arm_compute::support::cpp14::make_unique<FullyConnectedType>();
conv->configure(
dynamic_cast<TensorType *>(input),
- dynamic_cast<TensorType *>(weights.set_target(hint)),
- dynamic_cast<TensorType *>(biases.set_target(hint)),
+ dynamic_cast<TensorType *>(weights.set_target(target_hint)),
+ dynamic_cast<TensorType *>(biases.set_target(target_hint)),
dynamic_cast<TensorType *>(output));
if(!weights_are_loaded)
{
@@ -57,23 +68,23 @@
return std::move(conv);
}
-template <Hint hint>
+template <TargetHint target_hint>
std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output);
template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
{
- return instantiate_function<arm_compute::CLFullyConnectedLayer, arm_compute::CLTensor, Hint::OPENCL>(input, weights, biases, output);
+ return instantiate_function<arm_compute::CLFullyConnectedLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, weights, biases, output);
}
template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
{
- return instantiate_function<arm_compute::NEFullyConnectedLayer, arm_compute::Tensor, Hint::NEON>(input, weights, biases, output);
+ return instantiate_function<arm_compute::NEFullyConnectedLayer, arm_compute::Tensor, TargetHint::NEON>(input, weights, biases, output);
}
} // namespace
-std::unique_ptr<arm_compute::IFunction> FullyConnectedLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> FullyConnectedLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
{
if(_weights.tensor() == nullptr)
{
@@ -95,36 +106,31 @@
_biases.set_info(TensorInfo(TensorShape(_num_neurons), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
}
- arm_compute::auto_init_if_empty(*output->info(), TensorShape(_num_neurons, input->info()->dimension(1)), input->info()->num_channels(), input->info()->data_type(),
- input->info()->fixed_point_position());
+ // Auto configure output
+ arm_compute::auto_init_if_empty(*output->info(),
+ calculate_fullyconnected_layer_output_shape(input->info()->tensor_shape(), _num_neurons),
+ input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
std::unique_ptr<arm_compute::IFunction> func;
- _hint = hint;
- _input = input;
- _output = output;
+ _target_hint = ctx.hints().target_hint();
- if(_hint == Hint::OPENCL)
+ if(_target_hint == TargetHint::OPENCL)
{
- func = instantiate<Hint::OPENCL>(input, _weights, _biases, output);
+ func = instantiate<TargetHint::OPENCL>(input, _weights, _biases, output);
+ ARM_COMPUTE_LOG("Instantiating CLFullyConnectedLayer");
}
else
{
- func = instantiate<Hint::NEON>(input, _weights, _biases, output);
+ func = instantiate<TargetHint::NEON>(input, _weights, _biases, output);
+ ARM_COMPUTE_LOG("Instantiating NEFullyConnectedLayer");
}
+ ARM_COMPUTE_LOG(" Type: " << input->info()->data_type()
+ << " Input Shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << _weights.info().tensor_shape()
+ << " Biases Shape: " << _biases.info().tensor_shape()
+ << " Output Shape: " << output->info()->tensor_shape()
+ << std::endl);
+
return func;
}
-
-void FullyConnectedLayer::print_info()
-{
- if(_hint == Hint::OPENCL)
- {
- std::cout << "Instantiating CLFullyConnectedLayer";
- }
- else
- {
- std::cout << "Instantiating NEFullyConnectedLayer";
- }
- std::cout << " Type: " << _input->info()->data_type() << " Input Shape: " << _input->info()->tensor_shape() << " Weights shape: " << _weights.info().tensor_shape() << " Biases Shape: " <<
- _biases.info().tensor_shape() << " Output Shape: " << _output->info()->tensor_shape() << std::endl;
-}
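
Note on calculate_fullyconnected_layer_output_shape above: for a 2D input the batch count is dimension 1, while for a convolution-shaped 4D input laid out as (W, H, C, N) it is dimension 3. A simplified model using a plain vector for the shape (assumes the input is either 2D or 4D, matching the helper's 1D-batch restriction):

#include <vector>

std::vector<unsigned int> fc_output_shape(const std::vector<unsigned int> &input_shape,
                                          unsigned int output_neurons)
{
    unsigned int batches = input_shape[1];
    if(input_shape.size() > 2)
    {
        batches = input_shape[3]; // 4D input laid out as (W, H, C, N)
    }
    return { output_neurons, batches };
}

// fc_output_shape({2048, 16}, 1000)      -> {1000, 16}
// fc_output_shape({7, 7, 512, 16}, 4096) -> {4096, 16}
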
diff --git a/src/graph/nodes/L2NormalizeLayer.cpp b/src/graph/nodes/L2NormalizeLayer.cpp
new file mode 100644
index 0000000..46d1552
--- /dev/null
+++ b/src/graph/nodes/L2NormalizeLayer.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/L2NormalizeLayer.h"
+
+#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLL2Normalize.h"
+#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename L2NormalizeType, typename TensorType, TargetHint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+{
+ auto l2norm = arm_compute::support::cpp14::make_unique<L2NormalizeType>();
+ l2norm->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ axis,
+ epsilon);
+
+ return std::move(l2norm);
+}
+
+template <TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, unsigned int axis, float epsilon);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+{
+ return instantiate_function<arm_compute::CLL2Normalize, arm_compute::ICLTensor, TargetHint::OPENCL>(input, output, axis, epsilon);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+{
+ return instantiate_function<arm_compute::NEL2Normalize, arm_compute::ITensor, TargetHint::NEON>(input, output, axis, epsilon);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> L2NormalizeLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+{
+ std::unique_ptr<arm_compute::IFunction> func;
+ _target_hint = ctx.hints().target_hint();
+
+ if(_target_hint == TargetHint::OPENCL)
+ {
+ func = instantiate<TargetHint::OPENCL>(input, output, _axis, _epsilon);
+ ARM_COMPUTE_LOG("Instantiating CLL2NormalizeLayer");
+ }
+ else
+ {
+ func = instantiate<TargetHint::NEON>(input, output, _axis, _epsilon);
+ ARM_COMPUTE_LOG("Instantiating NEL2NormalizeLayer");
+ }
+
+ ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
+ return func;
+}
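
Note on the axis/epsilon parameters above: L2 normalization divides each element by the L2 norm of its row along the chosen axis, with epsilon bounding the sum of squares away from zero. A scalar sketch for a single 1D row, assuming that semantics:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// out[i] = in[i] / sqrt(max(sum(in[j]^2), epsilon))
std::vector<float> l2_normalize_row(const std::vector<float> &in, float epsilon)
{
    float sum_sq = 0.f;
    for(float v : in)
    {
        sum_sq += v * v;
    }
    const float norm = std::sqrt(std::max(sum_sq, epsilon));

    std::vector<float> out(in.size());
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        out[i] = in[i] / norm;
    }
    return out;
}
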
diff --git a/src/graph/nodes/NormalizationLayer.cpp b/src/graph/nodes/NormalizationLayer.cpp
new file mode 100644
index 0000000..47f0891
--- /dev/null
+++ b/src/graph/nodes/NormalizationLayer.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/NormalizationLayer.h"
+
+#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename NormalizationType, typename TensorType, TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
+{
+ auto norm = arm_compute::support::cpp14::make_unique<NormalizationType>();
+ norm->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ norm_info);
+
+ return std::move(norm);
+}
+
+template <TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
+{
+ return instantiate_function<arm_compute::CLNormalizationLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output, norm_info);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
+{
+ return instantiate_function<arm_compute::NENormalizationLayer, arm_compute::Tensor, TargetHint::NEON>(input, output, norm_info);
+}
+} // namespace
+
+NormalizationLayer::NormalizationLayer(const NormalizationLayerInfo norm_info)
+ : _norm_info(norm_info)
+{
+}
+
+std::unique_ptr<arm_compute::IFunction> NormalizationLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+{
+ std::unique_ptr<arm_compute::IFunction> func;
+ _target_hint = ctx.hints().target_hint();
+
+ if(_target_hint == TargetHint::OPENCL)
+ {
+ func = instantiate<TargetHint::OPENCL>(input, output, _norm_info);
+ ARM_COMPUTE_LOG("Instantiating CLNormalizationLayer");
+ }
+ else
+ {
+ func = instantiate<TargetHint::NEON>(input, output, _norm_info);
+ ARM_COMPUTE_LOG("Instantiating NENormalizationLayer");
+ }
+
+ ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Normalization info: " << _norm_info
+ << std::endl);
+
+ return func;
+}
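
Note on NormalizationLayerInfo above: for the CROSS_MAP case the layer performs AlexNet-style local response normalization. A scalar sketch over the channel values at one spatial position, assuming the alpha coefficient is scaled by the window size (the library's exact scaling per configuration may differ):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// out[c] = in[c] / (kappa + (alpha / n) * sum_{k in window(c)} in[k]^2)^beta
std::vector<float> lrn_cross_map(const std::vector<float> &in, std::size_t norm_size,
                                 float alpha, float beta, float kappa)
{
    const std::ptrdiff_t half = static_cast<std::ptrdiff_t>(norm_size / 2);
    const std::ptrdiff_t size = static_cast<std::ptrdiff_t>(in.size());
    std::vector<float>   out(in.size());

    for(std::ptrdiff_t c = 0; c < size; ++c)
    {
        float sum_sq = 0.f;
        for(std::ptrdiff_t k = std::max<std::ptrdiff_t>(0, c - half); k <= std::min(size - 1, c + half); ++k)
        {
            sum_sq += in[k] * in[k];
        }
        out[c] = in[c] / std::pow(kappa + (alpha / norm_size) * sum_sq, beta);
    }
    return out;
}
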
diff --git a/src/graph/nodes/PoolingLayer.cpp b/src/graph/nodes/PoolingLayer.cpp
index f29332f..317cf4d 100644
--- a/src/graph/nodes/PoolingLayer.cpp
+++ b/src/graph/nodes/PoolingLayer.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/graph/nodes/PoolingLayer.h"
+#include "arm_compute/core/Logger.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
@@ -34,7 +35,7 @@
namespace
{
-template <typename PoolingType, typename TensorType, Hint hint>
+template <typename PoolingType, typename TensorType, TargetHint target_hint>
std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
{
auto pool = arm_compute::support::cpp14::make_unique<PoolingType>();
@@ -46,19 +47,19 @@
return std::move(pool);
}
-template <Hint hint>
+template <TargetHint target_hint>
std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
{
- return instantiate_function<arm_compute::CLPoolingLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output, pool_info);
+ return instantiate_function<arm_compute::CLPoolingLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output, pool_info);
}
template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
{
- return instantiate_function<arm_compute::NEPoolingLayer, arm_compute::Tensor, Hint::NEON>(input, output, pool_info);
+ return instantiate_function<arm_compute::NEPoolingLayer, arm_compute::Tensor, TargetHint::NEON>(input, output, pool_info);
}
} // namespace
@@ -67,38 +68,26 @@
{
}
-std::unique_ptr<arm_compute::IFunction> PoolingLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> PoolingLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
{
std::unique_ptr<arm_compute::IFunction> func;
- _hint = hint;
- _input = input;
- _output = output;
+ _target_hint = ctx.hints().target_hint();
- if(_hint == Hint::OPENCL)
+ if(_target_hint == TargetHint::OPENCL)
{
- func = instantiate<Hint::OPENCL>(input, output, _pool_info);
+ func = instantiate<TargetHint::OPENCL>(input, output, _pool_info);
+ ARM_COMPUTE_LOG("Instantiating CLPoolingLayer");
}
else
{
- func = instantiate<Hint::NEON>(input, output, _pool_info);
+ func = instantiate<TargetHint::NEON>(input, output, _pool_info);
+ ARM_COMPUTE_LOG("Instantiating NEPoolingLayer");
}
+ ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Pooling info: " << _pool_info << std::endl);
+
return func;
}
-
-void PoolingLayer::print_info()
-{
- if(_hint == Hint::OPENCL)
- {
- std::cout << "Instantiating CLPoolingLayer";
- }
- else
- {
- std::cout << "Instantiating NEPoolingLayer";
- }
-
- std::cout << " Data Type: " << _input->info()->data_type()
- << " Input shape: " << _input->info()->tensor_shape()
- << " Output shape: " << _output->info()->tensor_shape()
- << " Pooling info: " << _pool_info << std::endl;
-}
diff --git a/src/graph/nodes/SoftmaxLayer.cpp b/src/graph/nodes/SoftmaxLayer.cpp
index fee8897..8628244 100644
--- a/src/graph/nodes/SoftmaxLayer.cpp
+++ b/src/graph/nodes/SoftmaxLayer.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/graph/nodes/SoftmaxLayer.h"
+#include "arm_compute/core/Logger.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
@@ -34,7 +35,7 @@
namespace
{
-template <typename SoftmaxType, typename TensorType, Hint hint>
+template <typename SoftmaxType, typename TensorType, TargetHint hint>
std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output)
{
auto softmax = arm_compute::support::cpp14::make_unique<SoftmaxType>();
@@ -45,53 +46,42 @@
return std::move(softmax);
}
-template <Hint hint>
+template <TargetHint target_hint>
std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output);
template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output)
{
- return instantiate_function<arm_compute::CLSoftmaxLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output);
+ return instantiate_function<arm_compute::CLSoftmaxLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output);
}
template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output)
{
- return instantiate_function<arm_compute::NESoftmaxLayer, arm_compute::Tensor, Hint::NEON>(input, output);
+ return instantiate_function<arm_compute::NESoftmaxLayer, arm_compute::Tensor, TargetHint::NEON>(input, output);
}
} // namespace
-std::unique_ptr<arm_compute::IFunction> SoftmaxLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> SoftmaxLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
{
std::unique_ptr<arm_compute::IFunction> func;
- _hint = hint;
- _input = input;
- _output = output;
+ _target_hint = ctx.hints().target_hint();
- if(_hint == Hint::OPENCL)
+ if(_target_hint == TargetHint::OPENCL)
{
- func = instantiate<Hint::OPENCL>(input, output);
+ func = instantiate<TargetHint::OPENCL>(input, output);
+ ARM_COMPUTE_LOG("Instantiating CLSoftmaxLayer");
}
else
{
- func = instantiate<Hint::NEON>(input, output);
+ func = instantiate<TargetHint::NEON>(input, output);
+ ARM_COMPUTE_LOG("Instantiating NESoftmaxLayer");
}
+ ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << std::endl);
+
return func;
}
-
-void SoftmaxLayer::print_info()
-{
- if(_hint == Hint::OPENCL)
- {
- std::cout << "Instantiating CLSoftmaxLayer";
- }
- else
- {
- std::cout << "Instantiating NESoftmaxLayer";
- }
- std::cout << " Data Type: " << _input->info()->data_type()
- << " Input shape: " << _input->info()->tensor_shape()
- << " Output shape: " << _output->info()->tensor_shape()
- << std::endl;
-}
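
Note on the softmax functions instantiated above: both backends compute softmax in a numerically stable way, subtracting the row maximum before exponentiating. A scalar sketch for one row (assumes a non-empty row):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> softmax_row(const std::vector<float> &in)
{
    const float max_val = *std::max_element(in.begin(), in.end());

    std::vector<float> out(in.size());
    float              sum = 0.f;
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        out[i] = std::exp(in[i] - max_val); // shifted exponential, cannot overflow upwards
        sum += out[i];
    }
    for(float &v : out)
    {
        v /= sum; // normalize so the row sums to 1
    }
    return out;
}
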
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
index f42627f..89e44ca 100644
--- a/src/runtime/CL/functions/CLDepthConcatenate.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -51,6 +52,11 @@
_concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
_border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+ TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
+
+ // Output auto-initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+
for(unsigned int i = 0; i < _num_inputs; i++)
{
_concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
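
Note on calculate_depth_concatenate_shape above: the destination stacks all input depths while keeping the largest width/height, so smaller planes can be centred and padded by the border kernels. A sketch of that shape rule with a simplified {width, height, depth} model:

#include <algorithm>
#include <array>
#include <vector>

using Shape3D = std::array<unsigned int, 3>; // {width, height, depth}

Shape3D depth_concatenate_shape(const std::vector<Shape3D> &inputs)
{
    Shape3D out{ { 0, 0, 0 } };
    for(const Shape3D &s : inputs)
    {
        out[0] = std::max(out[0], s[0]); // widest plane wins
        out[1] = std::max(out[1], s[1]); // tallest plane wins
        out[2] += s[2];                  // depths accumulate
    }
    return out;
}
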
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 9be1df6..c27ff2f 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -162,7 +162,7 @@
_edge_trace.configure(&_nonmax, output);
// Fill border with "No edge" to stop recursion in edge trace
- _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0);
+ _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
// Allocate intermediate tensors
_nonmax.allocator()->allocate();
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 40862fc..f34f497 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -190,8 +190,9 @@
{
if(_are_weights_reshaped)
{
- mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->info()->dimension(0) / 4 + (_has_bias ? 1 : 0);
+ const unsigned int transpose_width = 16 / input->info()->element_size();
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(0) / transpose_width + (_has_bias ? 1 : 0);
}
else
{
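
Note on transpose_width above: the reshaped-weights row count previously hard-coded the F32 case (16-byte blocks of four 4-byte elements); it now derives the block width from the element size. The arithmetic:

#include <cstddef>

// Elements per 16-byte transpose block.
constexpr unsigned int transpose_width(std::size_t element_size)
{
    return 16u / static_cast<unsigned int>(element_size);
}

static_assert(transpose_width(4) == 4, "F32: four elements per block");
static_assert(transpose_width(2) == 8, "F16: eight elements per block");
static_assert(transpose_width(1) == 16, "8-bit types: sixteen elements per block");
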
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
index ddf7e90..f8ad2ab 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
@@ -48,11 +49,16 @@
_concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
_border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+ TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
+
+ // Output auto-initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+
unsigned int depth_offset = 0;
for(unsigned int i = 0; i < _num_inputs; ++i)
{
_concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
depth_offset += inputs_vector.at(i)->info()->dimension(2);
}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index b831a6a..52a4cc1 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -82,7 +82,7 @@
}
// Add zero padding XY
- _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
}
void NEDirectConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
new file mode 100644
index 0000000..408eff5
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/Size2D.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
+ k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+ _kernel = std::move(k);
+}
\ No newline at end of file
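
Note on the configuration above: flattening is expressed as Im2Col with a 1x1 kernel, unit stride and no padding, so each element of a (W, H, C) volume maps to exactly one output element per batch item. The resulting length:

// (W, H, C) -> W * H * C elements per batch item; e.g. (7, 7, 512) -> 25088.
constexpr unsigned int flattened_length(unsigned int w, unsigned int h, unsigned int c)
{
    return w * h * c;
}
static_assert(flattened_length(7, 7, 512) == 25088, "VGG-style flatten");
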
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index 2e94ed5..fa8aaeb 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -36,5 +36,5 @@
auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
k->configure(input, output);
_kernel = std::move(k);
- _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, 0);
+ _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
}
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
index 3b59820..0854c9d 100644
--- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
+++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
@@ -38,10 +38,10 @@
if(border_mode != BorderMode::UNDEFINED)
{
- _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, 0);
+ _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast<float>(0.f));
}
else
{
- _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, 0);
+ _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast<float>(0.f));
}
}
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 4c4e11f..f8a85b9 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -23,19 +23,36 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
-#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "support/ToolchainSupport.h"
using namespace arm_compute;
+NEPoolingLayer::NEPoolingLayer()
+ : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false)
+{
+}
+
void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
{
+ // Check if we have Global Pooling Layer
+ _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size()) && (input->info()->dimension(1) == pool_info.pool_size());
+
// Configure pooling kernel
- auto k = arm_compute::support::cpp14::make_unique<NEPoolingLayerKernel>();
- k->configure(input, output, pool_info);
- _kernel = std::move(k);
+ _pooling_layer_kernel.configure(input, output, pool_info);
// Configure border depending on operation required
BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0));
+ _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, PixelValue(static_cast<float>(0.f)));
}
+
+void NEPoolingLayer::run()
+{
+ // Fill border
+ NEScheduler::get().schedule(&_border_handler, Window::DimY);
+
+ // Run pooling layer
+ NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+}
\ No newline at end of file
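
Note on _is_global_pooling_layer above: when the pooling window covers the whole input plane only one iteration per plane remains in DimY, so the kernel is split over DimZ instead. A minimal usage sketch, assuming the v17.10 API (shapes chosen so the 7x7 window makes the layer global):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(7U, 7U, 512U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(1U, 1U, 512U), 1, DataType::F32));

    NEPoolingLayer pool;
    pool.configure(&input, &output, PoolingLayerInfo(PoolingType::AVG, 7, PadStrideInfo(1, 1, 0, 0)));

    input.allocator()->allocate();
    output.allocator()->allocate();

    pool.run(); // scheduled over Window::DimZ since the pooling is global
    return 0;
}
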
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 45c3e5d..f1a9145 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -74,7 +74,7 @@
// Configure fill border kernel
BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
BorderMode fill_border_mode = reduction_operation_border_mode(op);
- _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(0));
+ _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(static_cast<float>(0.f)));
}
void NEReductionOperation::run()
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index be81641..1dd2511 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -51,32 +51,34 @@
void OMPScheduler::set_num_threads(unsigned int num_threads)
{
const unsigned int num_cores = omp_get_max_threads();
- _num_threads = num_threads == 0 ? num_cores : num_threads;
+ _num_threads = (num_threads == 0) ? num_cores : num_threads;
}
void OMPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
{
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
+ ThreadInfo info;
+ info.cpu_info = _info;
+
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
- const unsigned int num_threads = std::min(num_iterations, _num_threads);
+ info.num_threads = std::min(num_iterations, _num_threads);
- if(!kernel->is_parallelisable() || 1 == num_threads)
+ if(!kernel->is_parallelisable() || info.num_threads == 1)
{
- kernel->run(max_window);
+ kernel->run(max_window, info);
}
else
{
- #pragma omp parallel num_threads(num_threads)
+ #pragma omp parallel num_threads(info.num_threads)
{
#pragma omp for
- for(unsigned int t = 0; t < num_threads; ++t)
+ for(int t = 0; t < info.num_threads; ++t)
{
- Window win = max_window.split_window(split_dimension, t, num_threads);
- win.set_thread_id(t);
- win.set_num_threads(num_threads);
- kernel->run(win);
+ Window win = max_window.split_window(split_dimension, t, info.num_threads);
+ info.thread_id = t;
+ kernel->run(win, info);
}
}
}
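
Note on the scheduler change above: the thread id and thread count now travel in a ThreadInfo structure passed to the kernel's run method instead of being stored inside the Window. A standalone model of that contract (ThreadInfoModel and split_range are illustrative names, not library API):

#include <cstdio>

struct ThreadInfoModel
{
    int thread_id{ 0 };
    int num_threads{ 1 };
};

// Thread t of n processes iterations [begin, end); the last thread
// absorbs the remainder of the even split.
static void split_range(int num_iterations, const ThreadInfoModel &info, int &begin, int &end)
{
    const int chunk = num_iterations / info.num_threads;
    begin           = info.thread_id * chunk;
    end             = (info.thread_id == info.num_threads - 1) ? num_iterations : begin + chunk;
}

int main()
{
    ThreadInfoModel info;
    info.num_threads = 4;
    for(int t = 0; t < info.num_threads; ++t)
    {
        int begin = 0, end = 0;
        info.thread_id = t;
        split_range(10, info, begin, end);
        std::printf("thread %d: [%d, %d)\n", t, begin, end);
    }
    return 0;
}
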
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
index 1b06117..81de782 100644
--- a/src/runtime/Utils.cpp
+++ b/src/runtime/Utils.cpp
@@ -28,6 +28,10 @@
using namespace arm_compute;
+static const std::string information =
+#include "arm_compute_version.embed"
+ ;
+
const std::string &arm_compute::string_from_scheduler_type(Scheduler::Type t)
{
static std::map<Scheduler::Type, const std::string> scheduler_type_map =