arm_compute v17.06
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
new file mode 100644
index 0000000..3f5266c
--- /dev/null
+++ b/src/runtime/CL/CLHOG.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLHOG.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+// Default-construct an uninitialised HOG model: no descriptor meta-data and
+// no backing OpenCL buffer yet (allocation happens in init()).
+CLHOG::CLHOG()
+    : _info(), _buffer()
+{
+}
+
+// Allocate the OpenCL buffer that will hold the HOG descriptor.
+// Must be called exactly once: errors out if a buffer already exists.
+void CLHOG::init(const HOGInfo &input)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+    _info   = input;
+    // Host-visible buffer sized for descriptor_size() floats.
+    _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info()->descriptor_size() * sizeof(float));
+}
+
+// Release the descriptor buffer. The model must currently own a buffer.
+void CLHOG::free()
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+
+    // Assigning a default-constructed cl::Buffer drops the reference so the
+    // OpenCL runtime can release the allocation.
+    _buffer = cl::Buffer();
+}
+
+// Meta-data describing this HOG model (see HOGInfo).
+const HOGInfo *CLHOG::info() const
+{
+    return &_info;
+}
+
+// Underlying OpenCL buffer holding the descriptor values.
+const cl::Buffer &CLHOG::cl_buffer() const
+{
+    return _buffer;
+}
+
+// Map the descriptor into host memory using the default scheduler queue.
+// Errors out if the descriptor is already mapped (descriptor() non-null).
+void CLHOG::map(bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(descriptor() != nullptr);
+    ICLHOG::map(CLScheduler::get().queue(), blocking);
+}
+
+// Unmap a previously mapped descriptor; requires a prior successful map().
+void CLHOG::unmap()
+{
+    ARM_COMPUTE_ERROR_ON(descriptor() == nullptr);
+    ICLHOG::unmap(CLScheduler::get().queue());
+}
+
+// Enqueue the actual OpenCL map operation and return the host pointer.
+// The size argument of enqueueMapBuffer is in BYTES: the buffer holds
+// descriptor_size() floats (see init()), so map descriptor_size()*sizeof(float)
+// bytes — mapping only descriptor_size() bytes would expose just a quarter
+// of the descriptor to the host.
+uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size() * sizeof(float)));
+}
+
+// Enqueue the OpenCL unmap for the pointer previously returned by do_map().
+void CLHOG::do_unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+    q.enqueueUnmapMemObject(_buffer, descriptor());
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
new file mode 100644
index 0000000..b9e8739
--- /dev/null
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMultiHOG.h"
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+// Construct a container of num_models default-constructed CLHOG models.
+CLMultiHOG::CLMultiHOG(size_t num_models)
+    : _num_models(num_models), _model(arm_compute::cpp14::make_unique<CLHOG[]>(_num_models))
+{
+}
+
+// Number of HOG models held by this container.
+size_t CLMultiHOG::num_models() const
+{
+    return _num_models;
+}
+
+// Pointer to the model at the given index (must be < num_models()).
+ICLHOG *CLMultiHOG::cl_model(size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= _num_models);
+    return (_model.get() + index);
+}
+
+// Const overload of cl_model().
+const ICLHOG *CLMultiHOG::cl_model(size_t index) const
+{
+    ARM_COMPUTE_ERROR_ON(index >= _num_models);
+    return (_model.get() + index);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 1f3dbbe..fe25ce5 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -28,7 +28,7 @@
using namespace arm_compute;
CLScheduler::CLScheduler()
-    : _context(), _queue()
+    // Initialise the new _target member so it is never read uninitialised;
+    // Midgard is used as the default GPU target.
+    : _context(), _queue(), _target(GPUTarget::MIDGARD)
{
}
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
new file mode 100644
index 0000000..b228c0a
--- /dev/null
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+// Create a sub-tensor view over a region of an existing parent tensor.
+// The sub-tensor shares the parent's OpenCL buffer; no memory is allocated.
+CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+    : _parent(nullptr), _info()
+{
+    ARM_COMPUTE_ERROR_ON(parent == nullptr);
+    _info   = SubTensorInfo(parent->info(), tensor_shape, coords);
+    _parent = parent;
+}
+
+// Meta-data describing the sub-tensor region (const access path).
+ITensorInfo *CLSubTensor::info() const
+{
+    return &_info;
+}
+
+// Meta-data describing the sub-tensor region.
+ITensorInfo *CLSubTensor::info()
+{
+    return &_info;
+}
+
+// The sub-tensor owns no storage: expose the parent's buffer.
+const cl::Buffer &CLSubTensor::cl_buffer() const
+{
+    ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+    return _parent->cl_buffer();
+}
+
+// Tensor this sub-tensor is a view of.
+ICLTensor *CLSubTensor::parent()
+{
+    return _parent;
+}
+
+// Map the sub-tensor using the default scheduler queue.
+void CLSubTensor::map(bool blocking)
+{
+    ICLTensor::map(CLScheduler::get().queue(), blocking);
+}
+
+// Unmap a previously mapped sub-tensor.
+void CLSubTensor::unmap()
+{
+    ICLTensor::unmap(CLScheduler::get().queue());
+}
+
+// Enqueue the OpenCL map of the (parent) buffer.
+// NOTE(review): the map starts at offset 0 of the parent buffer, not at the
+// sub-tensor's start coordinates — presumably callers address elements via
+// the sub-tensor info's offsets; confirm against ICLTensor's usage.
+uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
+    return static_cast<uint8_t *>(q.enqueueMapBuffer(cl_buffer(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->total_size()));
+}
+
+// Enqueue the OpenCL unmap for the pointer returned by do_map().
+void CLSubTensor::do_unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
+    q.enqueueUnmapMemObject(cl_buffer(), buffer());
+}
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
new file mode 100644
index 0000000..3df673c
--- /dev/null
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+// Batch normalization is implemented as a single kernel; nothing to set up here.
+CLBatchNormalizationLayer::CLBatchNormalizationLayer()
+    : _norm_kernel()
+{
+}
+
+// Configure the kernel with the per-channel statistics (mean/var) and the
+// learned scale/shift parameters (gamma/beta); epsilon guards the division.
+void CLBatchNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
+{
+    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+}
+
+// Submit the single normalization kernel through the scheduler.
+void CLBatchNormalizationLayer::run()
+{
+    CLScheduler::get().enqueue(_norm_kernel, true);
+}
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index bb47bf9..f0bbc35 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -33,83 +33,155 @@
using namespace arm_compute;
-CLConvolutionLayer::CLConvolutionLayer()
- : _input_im2col_kernel(), _weights_reshape_kernel(), _input_interleave_kernel(), _weights_transposed_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
- _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _is_first_run(false), _has_bias(false), _is_fc(false)
+// All members start empty; configure() decides whether the extra
+// 1xW transpose stage is needed.
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
+    : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+{
+}
-void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+// Reshape convolution weights into the matrix layout expected by the GEMM
+// kernel. With transpose1xW an intermediate reshaped tensor is produced and
+// then 1xW-transposed into `output`; otherwise the reshape writes `output`
+// directly. `biases` may be nullptr (no bias row appended).
+// (A duplicated data-type assertion on `weights` was removed here.)
+void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    // NOTE(review): biases can be nullptr here — confirm these two macros
+    // tolerate a null tensor argument ahead of the explicit check below.
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases, output);
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
+
+    // A bias adds one extra row to the reshaped weights matrix.
+    const bool has_bias = (biases != nullptr);
+
+    _transpose1xW = transpose1xW;
+
+    if(transpose1xW)
+    {
+        // Create tensor to store the reshaped weights
+        const unsigned int mat_weights_cols = weights->info()->dimension(3);
+        const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (has_bias ? 1 : 0);
+        TensorShape        shape_wr(mat_weights_cols, mat_weights_rows);
+        const DataType     dt = weights->info()->data_type();
+        TensorInfo         info_wr(shape_wr, 1, dt);
+
+        _weights_reshaped.allocator()->init(info_wr);
+        _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+        _weights_transposed_kernel.configure(&_weights_reshaped, output);
+        // Allocate the intermediate only after both kernels are configured.
+        _weights_reshaped.allocator()->allocate();
+    }
+    else
+    {
+        _weights_reshape_kernel.configure(weights, biases, output);
+    }
+}
+
+// Enqueue the reshape kernel, then the 1xW transpose when configured.
+// (Removed an unused local `cl::CommandQueue q` — the scheduler's queue is
+// used implicitly by enqueue().)
+void CLConvolutionLayerReshapeWeights::run()
+{
+    CLScheduler::get().enqueue(_weights_reshape_kernel);
+    if(_transpose1xW)
+    {
+        CLScheduler::get().enqueue(_weights_transposed_kernel);
+    }
+}
+
+// Kernels and intermediate tensors are configured lazily in configure();
+// _are_weights_reshaped tracks whether the one-off weight reshape has run.
+CLConvolutionLayer::CLConvolutionLayer()
+    : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
+      _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+{
+}
+
+void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
}
- _has_bias = (biases != nullptr);
- _is_first_run = true;
+ _has_bias = (biases != nullptr);
+ _are_weights_reshaped = weights_info.are_reshaped();
// Get parameters for conv_info
- unsigned int stride_x, stride_y, pad_x, pad_y = 0;
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
std::tie(stride_x, stride_y) = conv_info.stride();
std::tie(pad_x, pad_y) = conv_info.pad();
- bool is_same_dimension = true;
- // Make sure the input and weights have same low three dimensions
- for(int i = 0; i < 3; i++)
- {
- is_same_dimension = (is_same_dimension) && (input->info()->dimension(i) == weights->info()->dimension(i));
- }
-
- // Run the fully connected path if is_same_dimension is true and conv_stride_x/conv_stride_y are 1, and conv_pad_x/conv_pad_y are 0 and skip col2im
- _is_fc = (is_same_dimension) && ((stride_x & stride_y) == 1) && ((pad_x | pad_y) == 0);
-
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+
+ const unsigned int kernel_width = _are_weights_reshaped ? weights_info.kernel_size() : weights->info()->dimension(0);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
stride_x, stride_y, pad_x, pad_y, conv_info.round());
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ // Check if its a "fully connected" convolution
+ _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+
// Create tensor to store the reshaped weights
- const size_t mat_weights_cols = weights->info()->dimension(3);
- const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
- const TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
- // Create tensor to store transposed weights
- TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
- TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
- _weights_transposed.allocator()->init(info_wt);
-
+ size_t mat_weights_cols = weights->info()->dimension(3);
+ size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = output->info()->dimension(2);
+ const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+ mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+ }
+ else
+ {
+ if(_is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, weights->info()->data_type());
+ _weights_reshaped.allocator()->init(info_wr);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false);
+ weights = &_weights_reshaped;
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
+ TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
+ _weights_transposed.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_transposed, true);
+ weights = &_weights_transposed;
+ }
+ }
// Create tensor to store im2col reshaped inputs
const size_t mat_input_cols = mat_weights_rows;
- const size_t mat_input_rows = _is_fc ? (input->info()->dimension(3)) : (conv_w * conv_h);
+ const size_t mat_input_rows = conv_w * conv_h;
TensorShape shape_im2col = input->info()->tensor_shape();
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- if(_is_fc)
- {
- shape_im2col.set(3, 1);
- }
_input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
- // Create tensor to prepare input tensor for GEMM
- TensorShape shape_interleaved = shape_im2col;
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+ // Create tensor (interleave) to prepare input tensor for GEMM
+ if(!_is_fully_connected_convolution)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+ }
// Create GEMM output tensor
TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
@@ -119,48 +191,57 @@
// Configure kernels
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
- _weights_transposed_kernel.configure(&_weights_reshaped, &_weights_transposed);
- if(_is_fc)
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ if(_is_fully_connected_convolution)
{
- _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, output, 1.0f);
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
}
else
{
- _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, &_gemm_output, 1.0f);
- _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
}
- // Allocate intermediate tensors
- _weights_reshaped.allocator()->allocate();
- _weights_transposed.allocator()->allocate();
+ if(!_are_weights_reshaped)
+ {
+ if(!_is_fully_connected_convolution)
+ {
+ _weights_transposed.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
+ }
+
_input_im2col_reshaped.allocator()->allocate();
- _input_interleaved_reshaped.allocator()->allocate();
+ if(!_is_fully_connected_convolution)
+ {
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
_gemm_output.allocator()->allocate();
}
void CLConvolutionLayer::run()
{
// Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
+    // Weights are reshaped on the first run only; the flag then short-circuits.
+    if(!_are_weights_reshaped)
{
-        _is_first_run = false;
-        CLScheduler::get().enqueue(_weights_reshape_kernel);
-        CLScheduler::get().enqueue(_weights_transposed_kernel);
+        _are_weights_reshaped = true;
+        _reshape_weights.run();
}
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
-    CLScheduler::get().enqueue(_input_interleave_kernel);
+    // The interleave stage feeds the GEMM only in the non fully-connected case.
+    if(!_is_fully_connected_convolution)
+    {
+        CLScheduler::get().enqueue(_input_interleave_kernel);
+    }
// Runs matrix multiply on reshaped matrices
CLScheduler::get().enqueue(_mm_kernel);
// Reshape output matrix
-
-    if(!_is_fc)
-    {
-        CLScheduler::get().enqueue(_output_col2im_kernel, false);
-    }
+    // col2im now runs unconditionally.
+    CLScheduler::get().enqueue(_output_col2im_kernel, false);
}
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
new file mode 100644
index 0000000..d967d98
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+// Nothing is allocated until configure() knows how many inputs there are.
+CLDepthConcatenate::CLDepthConcatenate()
+    : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+{
+}
+
+// Set up one concatenation kernel (plus a constant-zero border handler) per
+// input; each input is written into `output` at an increasing depth offset.
+void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+    _num_inputs = inputs_vector.size();
+
+    unsigned int depth_offset = 0;
+
+    _concat_kernels_vector  = arm_compute::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
+    _border_handlers_vector = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+        // The next input starts after this input's depth (dimension 2).
+        depth_offset += inputs_vector.at(i)->info()->dimension(2);
+    }
+}
+
+// Enqueue border handling then concatenation for each input, in order.
+// (Removed an unused local `cl::CommandQueue q` — the scheduler's queue is
+// used implicitly by enqueue().)
+void CLDepthConcatenate::run()
+{
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+    }
+}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 08e18df..57d57d5 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -31,35 +31,99 @@
using namespace arm_compute;
+// Members start empty; configure() selects which of the two kernels run.
+CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
+    : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+{
+}
+
+// Reshape 2D fully-connected weights: optionally transpose them, and for the
+// batched path additionally produce the 1xW-transposed form consumed by the
+// GEMM kernel. At least one of the two flags must be set (asserted below).
+void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
+    ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+
+    // Propagate type / fixed-point info to the intermediate tensor.
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
+    _transpose_weights   = transpose_weights;
+    _is_batched_fc_layer = is_batched_fc_layer;
+
+    // Check if we need to transpose the weights
+    if(_transpose_weights)
+    {
+        if(_is_batched_fc_layer)
+        {
+            // Initialize the output tensor for transpose
+            TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
+            _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+            _transpose_kernel.configure(input, &_transpose_output);
+
+            // Configure transpose 1xW kernel
+            _transpose1xW_kernel.configure(&_transpose_output, output);
+
+            // Allocate temporary tensor used for transposing the weights
+            _transpose_output.allocator()->allocate();
+        }
+        else
+        {
+            _transpose_kernel.configure(input, output);
+        }
+    }
+    else
+    {
+        if(_is_batched_fc_layer)
+        {
+            // Configure transpose 1xW kernel
+            _transpose1xW_kernel.configure(input, output);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+        }
+    }
+}
+
+// Launch whichever kernels configure() set up, transpose first.
+// NOTE(review): the second enqueue argument is tied to the batched flag here
+// (cf. the plain enqueues elsewhere) — confirm its semantics against
+// CLScheduler::enqueue before changing.
+void CLFullyConnectedLayerReshapeWeights::run()
+{
+    if(_transpose_weights)
+    {
+        CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
+    }
+    if(_is_batched_fc_layer)
+    {
+        CLScheduler::get().enqueue(_transpose1xW_kernel);
+    }
+}
+
CLFullyConnectedLayer::CLFullyConnectedLayer()
-    : _im2col_kernel(), _transpose_kernel(), _transpose1xW_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _transpose_output(),
-    _transpose1xW_output(), _is_first_run(true), _transpose_weights(true), _fc_after_conv(true), _batched_fc_layer(false), _accumulate_biases(false)
+    // Weight reshaping is now delegated to _reshape_weights_kernel; the
+    // standalone transpose kernels/tensors moved into
+    // CLFullyConnectedLayerReshapeWeights.
+    : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
+      _are_weights_reshaped(true), _is_fc_after_conv(true), _is_batched_fc_layer(false), _accumulate_biases(false)
{
}
void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
TensorShape shape_im2col;
- shape_im2col.set(0, weights->info()->dimension(1));
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
shape_im2col.set(1, input->info()->dimension(3));
shape_im2col.set(2, input->info()->dimension(4));
shape_im2col.set(3, input->info()->dimension(5));
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
// Initialize output tensor for interleave 4x4
TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
- // Initialize output tensor for transpose 1xW
- TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
- _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
// Configure im2col kernel
_im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -67,55 +131,49 @@
// Configure interleave4x4 kernel
_interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
// Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
// Allocate the tensors once all the configure methods have been called
_im2col_output.allocator()->allocate();
_interleave4x4_output.allocator()->allocate();
- _transpose1xW_output.allocator()->allocate();
}
void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
+    // Propagate type / fixed-point info to the intermediate tensor.
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
// Initialize output tensor for interleave 4x4
TensorShape shape_interleaved = input->info()->tensor_shape();
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
-    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
-    // Initialize output tensor for transpose 1xW
-    TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
-    _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position))3;
// Configure interleave4x4 kernel
_interleave4x4_kernel.configure(input, &_interleave4x4_output);
-    // Configure transpose 1xW kernel
-    _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
// Configure matrix multiply kernel
-    _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+    // Weights now arrive already reshaped (see CLFullyConnectedLayerReshapeWeights),
+    // so the GEMM consumes them directly.
+    _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
// Allocate the tensors once all the configure methods have been called
_interleave4x4_output.allocator()->allocate();
-    _transpose1xW_output.allocator()->allocate();
}
void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
TensorShape shape_im2col;
- shape_im2col.set(0, weights->info()->dimension(1));
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
shape_im2col.set(1, 1);
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
// Configure im2col kernel
_im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -135,20 +193,20 @@
_mm_kernel.configure(input, weights, output, 1.0f);
}
-void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights)
+void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
- const ICLTensor *weights_to_use = weights;
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
- _is_first_run = true;
- _transpose_weights = transpose_weights;
- _fc_after_conv = true;
- _batched_fc_layer = false;
- _accumulate_biases = false;
+ _are_weights_reshaped = are_weights_reshaped;
+ _is_fc_after_conv = true;
+ _is_batched_fc_layer = false;
+ _accumulate_biases = false;
if(biases != nullptr)
{
@@ -160,17 +218,6 @@
_accumulate_biases_kernel.configure(output, biases);
}
- // Check if we need to transpose the weights
- if(_transpose_weights)
- {
- // Initialize the output tensor for transpose
- TensorShape shape_transposed(weights->info()->dimension(1), weights->info()->dimension(0));
- _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, weights->info()->data_type()));
- _transpose_kernel.configure(weights, &_transpose_output);
-
- weights_to_use = &_transpose_output;
- }
-
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
// 2) Fully Connected layer -> Fully Connected layer without batches
@@ -178,15 +225,54 @@
// 4) Fully Connected layer -> Fully Connected layer with batches
// Check if we have a fully connected layer with batches
- _batched_fc_layer = (output->info()->dimension(1) > 1);
+ _is_batched_fc_layer = (output->info()->dimension(1) > 1);
- if(_batched_fc_layer)
+ const ICLTensor *weights_to_use = weights;
+
+ if(!are_weights_reshaped)
{
- _fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ if((transpose_weights || _is_batched_fc_layer))
+ {
+ weights_to_use = &_reshape_weights_output;
- if(_fc_after_conv)
+ if(transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ else
+ {
+ TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
+
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ }
+ }
+
+ if(_is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+
+ if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer with batches
configure_conv_fc_wb(input, weights_to_use, output);
@@ -199,9 +285,10 @@
}
else
{
- _fc_after_conv = (weights_to_use->info()->dimension(1) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+ // If the fully connected layer is not batched, the weights will not be reshaped using transpose1xW
+ _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
- if(_fc_after_conv)
+ if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
configure_conv_fc_nb(input, weights_to_use, output);
@@ -213,39 +300,34 @@
}
}
- // Allocate the transpose tensor if the transpose_weights flag is true and once all the configure methods have been called
- if(_transpose_weights)
+ // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
+ if(!are_weights_reshaped)
{
- _transpose_output.allocator()->allocate();
+ if(transpose_weights || _is_batched_fc_layer)
+ {
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
+ }
}
}
void CLFullyConnectedLayer::run()
{
- // The reshape of the weights happens only once
- if(_is_first_run)
+ // Reshape of the weights (happens only once)
+ if(!_are_weights_reshaped)
{
- _is_first_run = false;
-
- if(_transpose_weights)
- {
- CLScheduler::get().enqueue(_transpose_kernel);
- }
-
- if(_batched_fc_layer)
- {
- CLScheduler::get().enqueue(_transpose1xW_kernel);
- }
+ _are_weights_reshaped = true;
+ _reshape_weights_kernel.run();
}
// Linearize input if it comes from a convolutional layer
- if(_fc_after_conv)
+ if(_is_fc_after_conv)
{
CLScheduler::get().enqueue(_im2col_kernel, false);
}
// Interleave input
- if(_batched_fc_layer)
+ if(_is_batched_fc_layer)
{
CLScheduler::get().enqueue(_interleave4x4_kernel, false);
}
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
new file mode 100644
index 0000000..b1b5a03
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOGDescriptor::CLHOGDescriptor()
+ : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+{
+}
+
+void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == hog);
+
+ const HOGInfo *hog_info = hog->info();
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const size_t num_bins = hog_info->num_bins();
+
+ Size2D cell_size = hog_info->cell_size();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell_size.width;
+ const size_t num_cells_y = height / cell_size.height;
+
+ // TensorShape of the input image
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // TensorShape of the hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Initialize tensors for magnitude, phase and hog space
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space.allocator()->init(info_space);
+
+ // Initialise gradient kernel
+ _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+
+ // Initialise orientation binning kernel
+ _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+
+ // Initialize HOG norm kernel
+ _block_norm.configure(&_hog_space, output, hog->info());
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _hog_space.allocator()->allocate();
+}
+
+void CLHOGDescriptor::run()
+{
+ // Run gradient
+ _gradient.run();
+
+ // Run orientation binning
+ CLScheduler::get().enqueue(_orient_bin, false);
+
+ // Run block normalization
+ CLScheduler::get().enqueue(_block_norm);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
new file mode 100644
index 0000000..8eb5e42
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGDetector.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+CLHOGDetector::CLHOGDetector()
+ : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows()
+{
+}
+
+void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
+{
+ _detection_windows = detection_windows;
+
+ // Allocate buffer for storing the number of detected objects
+ _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
+
+ // Configure HOGDetectorKernel
+ _hog_detector_kernel.configure(input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
+}
+
+void CLHOGDetector::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // Reset number of detections
+ const unsigned int init_num_detection_windows = _detection_windows->num_values();
+ q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows);
+
+ // Run CLHOGDetectorKernel
+ CLScheduler::get().enqueue(_hog_detector_kernel);
+
+ // Read number of detections
+ unsigned int num_detection_windows = 0;
+ q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows);
+
+ // Update the number of values stored in _detection_windows
+ _detection_windows->resize(static_cast<size_t>(num_detection_windows));
+
+ q.flush();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
new file mode 100644
index 0000000..2387474
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOGGradient::CLHOGGradient()
+ : _derivative(), _mag_phase(), _gx(), _gy()
+{
+}
+
+void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8);
+
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // Allocate image memory
+ TensorInfo info(shape_img, Format::S16);
+ _gx.allocator()->init(info);
+ _gy.allocator()->init(info);
+
+ // Initialise derivative kernel
+ _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
+
+ // Initialise magnitude/phase kernel
+ if(PhaseType::UNSIGNED == phase_type)
+ {
+ _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
+ }
+ else
+ {
+ _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
+ }
+
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+}
+
+void CLHOGGradient::run()
+{
+ // Run derivative
+ _derivative.run();
+
+ // Run magnitude/phase kernel
+ CLScheduler::get().enqueue(_mag_phase);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
new file mode 100644
index 0000000..b8f2224
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+using namespace arm_compute;
+
+CLHOGMultiDetection::CLHOGMultiDetection()
+ : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
+ _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+{
+}
+
+void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
+ uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
+ ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
+ ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
+
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const TensorShape &shape_img = input->info()->tensor_shape();
+ const size_t num_models = multi_hog->num_models();
+ PhaseType phase_type = multi_hog->model(0)->info()->phase_type();
+
+ size_t prev_num_bins = multi_hog->model(0)->info()->num_bins();
+ Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size();
+ Size2D prev_block_size = multi_hog->model(0)->info()->block_size();
+ Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
+
+ /* Check if CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
+ *
+ * 1) CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change.
+ * Since "multi_hog" is sorted, it is enough to check the HOG descriptors at level "ith" and level "(i-1)th"
+ * 2) CLHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and block size do not change.
+ * Since "multi_hog" is sorted, it is enough to check the HOG descriptors at level "ith" and level "(i-1)th"
+ *
+ * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel
+ * with "input_orient_bin", "input_hog_detect" and "input_block_norm"
+ */
+ std::vector<size_t> input_orient_bin;
+ std::vector<size_t> input_hog_detect;
+ std::vector<std::pair<size_t, size_t>> input_block_norm;
+
+ input_orient_bin.push_back(0);
+ input_hog_detect.push_back(0);
+ input_block_norm.emplace_back(0, 0);
+
+ for(size_t i = 1; i < num_models; ++i)
+ {
+ size_t cur_num_bins = multi_hog->model(i)->info()->num_bins();
+ Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size();
+ Size2D cur_block_size = multi_hog->model(i)->info()->block_size();
+ Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride();
+
+ if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
+ {
+ prev_num_bins = cur_num_bins;
+ prev_cell_size = cur_cell_size;
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute orientation binning and block normalization kernels. Update input to process
+ input_orient_bin.push_back(i);
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+ else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
+ || (cur_block_stride.height != prev_block_stride.height))
+ {
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute block normalization kernel. Update input to process
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+
+ // Update input to process for hog detector kernel
+ input_hog_detect.push_back(input_block_norm.size() - 1);
+ }
+
+ _detection_windows = detection_windows;
+ _non_maxima_suppression = non_maxima_suppression;
+ _num_orient_bin_kernel = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute
+ _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
+ _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
+
+ _orient_bin_kernel = arm_compute::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+
+ // Allocate tensors for magnitude and phase
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ // Initialise gradient kernel
+ _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
+
+ // Configure CLTensor for the HOG space and orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_orient_bin[i];
+
+ // Get the corresponding cell size and number of bins
+ const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size();
+ const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell.width;
+ const size_t num_cells_y = height / cell.height;
+
+ // TensorShape of hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Allocate HOG space
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space[i].allocator()->init(info_space);
+
+ // Initialise orientation binning kernel
+ _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ // Configure CLTensor for the normalized HOG space and block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_block_norm[i].first;
+ const size_t idx_orient_bin = input_block_norm[i].second;
+
+ // Allocate normalized HOG space
+ TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
+ _hog_norm_space[i].allocator()->init(tensor_info);
+
+ // Initialize block normalization kernel
+ _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ detection_window_strides->map(CLScheduler::get().queue(), true);
+
+ // Configure HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ const size_t idx_block_norm = input_hog_detect[i];
+
+ _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ }
+
+ detection_window_strides->unmap(CLScheduler::get().queue());
+
+ // Configure non maxima suppression kernel
+ _non_maxima_kernel->configure(_detection_windows, min_distance);
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ _hog_norm_space[i].allocator()->allocate();
+ }
+}
+
+void CLHOGMultiDetection::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+
+ // Reset detection window
+ _detection_windows->clear();
+
+ // Run gradient
+ _gradient_kernel.run();
+
+ // Run orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ CLScheduler::get().enqueue(*(_orient_bin_kernel.get() + i), false);
+ }
+
+ // Run block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ CLScheduler::get().enqueue(*(_block_norm_kernel.get() + i), false);
+ }
+
+ // Run HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ _hog_detect_kernel[i].run();
+ }
+
+ // Run non-maxima suppression kernel if enabled
+ if(_non_maxima_suppression)
+ {
+ // Map detection windows array before computing non maxima suppression
+ _detection_windows->map(CLScheduler::get().queue(), true);
+ _non_maxima_kernel->run(_non_maxima_kernel->window());
+ _detection_windows->unmap(CLScheduler::get().queue());
+ }
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 6501da3..2db277f 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -34,8 +34,8 @@
#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
+#include "arm_compute/runtime/Scheduler.h"
#include <cmath>
#include <utility>
@@ -148,7 +148,7 @@
// Run corner candidate kernel
_nonmax.map(true);
- CPPScheduler::get().multithread(&_candidates);
+ Scheduler::get().schedule(&_candidates, Window::DimY);
_nonmax.unmap();
_corners->map(CLScheduler::get().queue(), true);
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
new file mode 100644
index 0000000..263fb51
--- /dev/null
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+// Default constructor: value-initializes every kernel and intermediate tensor.
+// _is_first_run starts false and is set to true by configure(), which arms the
+// one-time weights-reshape pass performed on the first run().
+CLLocallyConnectedLayer::CLLocallyConnectedLayer()
+    : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+{
+}
+
+// Validate the inputs, derive the reshaped-tensor geometry and configure the
+// im2col, weights-reshape, matrix-multiply and col2im kernels.
+//
+// @param input     Source tensor (F32 only, as enforced below).
+// @param weights   Weights tensor; dimension 2 must match the input's channel
+//                  dimension, dimension 4 must equal conv_w * conv_h.
+// @param biases    Optional biases tensor (may be nullptr); when present its
+//                  dimension 0 must match weights dimension 3.
+// @param output    Destination tensor; its first two dimensions must equal the
+//                  convolved width/height computed from conv_info.
+// @param conv_info Stride/padding information used to derive conv_w and conv_h.
+void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
+    }
+
+    // Local flag; renamed from "_has_bias" so it no longer mimics the
+    // leading-underscore member-naming convention used by this class.
+    const bool has_bias = (biases != nullptr);
+    _is_first_run       = true;
+
+    // Get parameters for conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    unsigned int pad_x    = 0;
+    unsigned int pad_y    = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+    std::tie(pad_x, pad_y)       = conv_info.pad();
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+                                                 stride_x, stride_y, pad_x, pad_y, conv_info.round());
+
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+    // Create tensor to store the reshaped weights. When a bias is present the
+    // reshaped weights get one extra row so the bias folds into the multiply.
+    const size_t mat_weights_cols = weights->info()->dimension(3);
+    const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((has_bias) ? 1 : 0);
+    const size_t mat_weights_num  = weights->info()->dimension(4);
+
+    const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+    _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+
+    // Create tensor to store im2col reshaped inputs
+    const size_t mat_input_cols = mat_weights_rows;
+    const size_t mat_input_rows = conv_w * conv_h;
+    TensorShape  shape_im2col   = input->info()->tensor_shape();
+    shape_im2col.set(0, mat_input_cols);
+    shape_im2col.set(1, mat_input_rows);
+    shape_im2col.set(2, 1);
+
+    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+    // Create locally connected layer output tensor
+    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+    shape_gemm.set(0, mat_weights_cols);
+    shape_gemm.set(1, mat_input_rows);
+    _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+    // Configure kernels
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, has_bias);
+    _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+    _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+    // Allocate intermediate tensors (done after kernel configuration so the
+    // TensorInfo set up above is final before memory is committed).
+    _weights_reshaped.allocator()->allocate();
+    _input_im2col_reshaped.allocator()->allocate();
+    _gemm_output.allocator()->allocate();
+}
+
+// Enqueue the layer's kernels on the CL command queue in pipeline order:
+// (optional weights reshape) -> im2col -> matrix multiply -> col2im.
+void CLLocallyConnectedLayer::run()
+{
+    // Run weights reshaping only once per configure(): the flag is armed in
+    // configure() and cleared here, so later run() calls reuse the reshaped
+    // weights already resident on the device.
+    if(_is_first_run)
+    {
+        _is_first_run = false;
+        CLScheduler::get().enqueue(_weights_reshape_kernel);
+    }
+
+    // Run input reshaping (im2col)
+    CLScheduler::get().enqueue(_input_im2col_kernel);
+
+    // Runs vector matrix multiply on reshaped matrices
+    CLScheduler::get().enqueue(_mm_kernel);
+
+    // Reshape output matrix. NOTE(review): the second argument presumably
+    // suppresses the queue flush after this enqueue — confirm against
+    // CLScheduler::enqueue's flush parameter semantics.
+    CLScheduler::get().enqueue(_output_col2im_kernel, false);
+}