arm_compute v17.09
Change-Id: I4bf8f4e6e5f84ce0d5b6f5ba570d276879f42a81
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
index 5097dd4..5613e6c 100644
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
index 56c5199..78f25fc 100644
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,21 +32,21 @@
void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum)
{
- auto k = arm_compute::cpp14::make_unique<CLAccumulateKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAccumulateKernel>();
k->configure(input, accum);
_kernel = std::move(k);
}
void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
{
- auto k = arm_compute::cpp14::make_unique<CLAccumulateWeightedKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAccumulateWeightedKernel>();
k->configure(input, alpha, accum);
_kernel = std::move(k);
}
void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
{
- auto k = arm_compute::cpp14::make_unique<CLAccumulateSquaredKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLAccumulateSquaredKernel>();
k->configure(input, shift, accum);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 9b5bd8b..b64739a 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -24,13 +24,13 @@
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-void CLActivationLayer::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
{
- auto k = arm_compute::cpp14::make_unique<CLActivationLayerKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
k->configure(input, output, act_info);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
index 36bff42..5ca384d 100644
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ b/src/runtime/CL/functions/CLArithmeticAddition.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLArithmeticAddition::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
- auto k = arm_compute::cpp14::make_unique<CLArithmeticAdditionKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
index 97f0a1c..651f51a 100644
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
{
- auto k = arm_compute::cpp14::make_unique<CLArithmeticSubtractionKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index 3df673c..68cdaac 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -37,7 +37,7 @@
{
}
-void CLBatchNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
+void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
{
_norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
}
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index 7c85043..f8a5a85 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseAndKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseAndKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index 17ae5de..dc002e5 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseNotKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseNotKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index c84a279..4a10bb2 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseOrKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseOrKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index fd49c7d..d23622a 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLBitwiseXorKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBitwiseXorKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
index 8de6807..f28be44 100644
--- a/src/runtime/CL/functions/CLBox3x3.cpp
+++ b/src/runtime/CL/functions/CLBox3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLBox3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLBox3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 1d018b8..5acb8e7 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -26,17 +26,31 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLCannyEdge::CLCannyEdge()
- : _sobel(nullptr), _gradient(), _border_mag_gradient(), _non_max_suppr(), _edge_trace(), _gx(), _gy(), _mag(), _phase(), _nonmax(), _visited(), _recorded(), _l1_list_counter(), _l1_stack()
+CLCannyEdge::CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _sobel(),
+ _gradient(),
+ _border_mag_gradient(),
+ _non_max_suppr(),
+ _edge_trace(),
+ _gx(),
+ _gy(),
+ _mag(),
+ _phase(),
+ _nonmax(),
+ _visited(),
+ _recorded(),
+ _l1_list_counter(),
+ _l1_stack()
{
}
@@ -83,22 +97,26 @@
TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32);
_l1_stack.allocator()->init(info_s32);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Configure/Init sobelNxN
if(gradient_size == 3)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 5)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 7)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
@@ -107,23 +125,43 @@
ARM_COMPUTE_ERROR("Gradient %d size not supported", gradient_size);
}
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Configure gradient
_gradient.configure(&_gx, &_gy, &_mag, &_phase, norm_type);
+ // Allocate intermediate buffers
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_nonmax);
+
// Configure non-maxima suppression
_non_max_suppr.configure(&_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
+ // Allocate intermediate buffers
+ _phase.allocator()->allocate();
+
// Fill border around magnitude image as non-maxima suppression will access
// it. If border mode is undefined filling the border is a nop.
_border_mag_gradient.configure(&_mag, _non_max_suppr.border_size(), border_mode, constant_border_value);
+ // Allocate intermediate buffers
+ _mag.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_visited);
+ _memory_group.manage(&_recorded);
+ _memory_group.manage(&_l1_stack);
+ _memory_group.manage(&_l1_list_counter);
+
// Configure edge tracing
_edge_trace.configure(&_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
- _gx.allocator()->allocate();
- _gy.allocator()->allocate();
- _phase.allocator()->allocate();
- _mag.allocator()->allocate();
+ // Allocate intermediate buffers
_visited.allocator()->allocate();
_recorded.allocator()->allocate();
_l1_stack.allocator()->allocate();
@@ -133,6 +171,8 @@
void CLCannyEdge::run()
{
+ _memory_group.acquire();
+
// Run sobel
_sobel->run();
@@ -152,4 +192,6 @@
_l1_list_counter.clear(CLScheduler::get().queue());
_l1_stack.clear(CLScheduler::get().queue());
CLScheduler::get().enqueue(_edge_trace, true);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
index 79a3676..11605cf 100644
--- a/src/runtime/CL/functions/CLChannelCombine.cpp
+++ b/src/runtime/CL/functions/CLChannelCombine.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,14 +32,14 @@
void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
k->configure(plane0, plane1, plane2, plane3, output);
_kernel = std::move(k);
}
void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
k->configure(plane0, plane1, plane2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
index 2c6174b..5090382 100644
--- a/src/runtime/CL/functions/CLChannelExtract.cpp
+++ b/src/runtime/CL/functions/CLChannelExtract.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,14 +32,14 @@
void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
k->configure(input, channel, output);
_kernel = std::move(k);
}
void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
k->configure(input, channel, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
index 2fe465a..65f8ac3 100644
--- a/src/runtime/CL/functions/CLColorConvert.cpp
+++ b/src/runtime/CL/functions/CLColorConvert.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,28 +32,28 @@
void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output)
{
- auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 21b5d47..a9b0867 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -26,13 +26,13 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -40,15 +40,15 @@
void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLConvolution3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLConvolution3x3Kernel>();
k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::CLConvolutionSquare()
- : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
{
}
@@ -66,6 +66,9 @@
std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
if(scale == 0)
{
scale = calculate_matrix_scale(conv, matrix_size);
@@ -92,8 +95,12 @@
if(_is_separable)
{
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
+
+ _memory_group.release();
}
else
{
@@ -107,7 +114,7 @@
void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLConvolutionRectangleKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLConvolutionRectangleKernel>();
k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index f0bbc35..4b1bfd8 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -24,32 +24,31 @@
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include <cmath>
+#include <memory>
#include <tuple>
using namespace arm_compute;
-CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
- : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
}
void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
@@ -65,10 +64,12 @@
const unsigned int mat_weights_cols = weights->info()->dimension(3);
const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- const DataType dt = weights->info()->data_type();
- TensorInfo info_wr(shape_wr, 1, dt);
+ const DataType dt = weights->info()->data_type();
+ const int fixed_point_position = weights->info()->fixed_point_position();
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
_weights_reshaped.allocator()->init(info_wr);
+ _memory_group.manage(&_weights_reshaped);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_weights_transposed_kernel.configure(&_weights_reshaped, output);
_weights_reshaped.allocator()->allocate();
@@ -81,41 +82,50 @@
void CLConvolutionLayerReshapeWeights::run()
{
+ _memory_group.acquire();
+
cl::CommandQueue q = CLScheduler::get().queue();
CLScheduler::get().enqueue(_weights_reshape_kernel);
if(_transpose1xW)
{
CLScheduler::get().enqueue(_weights_transposed_kernel);
}
+
+ _memory_group.release();
}
-CLConvolutionLayer::CLConvolutionLayer()
- : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
- _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
+ _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
{
}
void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
}
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // Set the GPU target for matrix multiply
+ _mm_kernel.set_target(CLScheduler::get().target());
+
_has_bias = (biases != nullptr);
_are_weights_reshaped = weights_info.are_reshaped();
- // Get parameters for conv_info
+ // Get parameters from conv_info
unsigned int stride_x = 0;
unsigned int stride_y = 0;
unsigned int pad_x = 0;
@@ -127,20 +137,21 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- const unsigned int kernel_width = _are_weights_reshaped ? weights_info.kernel_size() : weights->info()->dimension(0);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
- stride_x, stride_y, pad_x, pad_y, conv_info.round());
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
+ const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+ conv_info);
// Check if its a "fully connected" convolution
_is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
- // Create tensor to store the reshaped weights
- size_t mat_weights_cols = weights->info()->dimension(3);
- size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ unsigned int mat_weights_cols = weights->info()->dimension(3);
+ unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+
+ // Reshape weights if needed
if(_are_weights_reshaped)
{
- mat_weights_cols = output->info()->dimension(2);
+ mat_weights_cols = weights_info.num_kernels();
const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
}
@@ -150,77 +161,75 @@
{
// Create tensor to store the reshaped weights
TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, weights->info()->data_type());
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
_weights_reshaped.allocator()->init(info_wr);
- _reshape_weights.configure(weights, biases, &_weights_reshaped, false);
- weights = &_weights_reshaped;
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
}
else
{
// Create tensor to store transposed weights
- TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
- TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
- _weights_transposed.allocator()->init(info_wt);
- _reshape_weights.configure(weights, biases, &_weights_transposed, true);
- weights = &_weights_transposed;
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
}
+ weights = &_weights_reshaped;
}
+
// Create tensor to store im2col reshaped inputs
- const size_t mat_input_cols = mat_weights_rows;
- const size_t mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
+ const unsigned int mat_input_cols = mat_weights_rows;
+ const unsigned int mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+ _memory_group.manage(&_input_im2col_reshaped);
// Create tensor (interleave) to prepare input tensor for GEMM
if(!_is_fully_connected_convolution)
{
TensorShape shape_interleaved = shape_im2col;
shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4.f));
- _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ _memory_group.manage(&_input_interleaved_reshaped);
}
// Create GEMM output tensor
TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
- _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+ _memory_group.manage(&_gemm_output);
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
- _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+ // Configure matrix multiply
if(_is_fully_connected_convolution)
{
- _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ // The matrix A and Matrix B have not been reshaped
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f, false);
}
else
{
_input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
_mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
- }
-
- if(!_are_weights_reshaped)
- {
- if(!_is_fully_connected_convolution)
- {
- _weights_transposed.allocator()->allocate();
- }
- else
- {
- _weights_reshaped.allocator()->allocate();
- }
- }
-
- _input_im2col_reshaped.allocator()->allocate();
- if(!_is_fully_connected_convolution)
- {
_input_interleaved_reshaped.allocator()->allocate();
}
+ _input_im2col_reshaped.allocator()->allocate();
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
_gemm_output.allocator()->allocate();
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+ // Allocate intermediate tensor
+ if(!_are_weights_reshaped)
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
}
void CLConvolutionLayer::run()
@@ -232,6 +241,8 @@
_reshape_weights.run();
}
+ _memory_group.acquire();
+
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
if(!_is_fully_connected_convolution)
@@ -244,4 +255,6 @@
// Reshape output matrix
CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
index d967d98..f42627f 100644
--- a/src/runtime/CL/functions/CLDepthConcatenate.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -24,22 +24,23 @@
#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLDepthConcatenate::CLDepthConcatenate()
- : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+CLDepthConcatenate::CLDepthConcatenate() // NOLINT
+ : _inputs_vector(),
+ _concat_kernels_vector(),
+ _border_handlers_vector(),
+ _num_inputs(0)
{
}
-void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output)
+void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
{
ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
@@ -47,8 +48,8 @@
unsigned int depth_offset = 0;
- _concat_kernels_vector = arm_compute::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
- _border_handlers_vector = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+ _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
for(unsigned int i = 0; i < _num_inputs; i++)
{
diff --git a/src/runtime/CL/functions/CLDepthConvert.cpp b/src/runtime/CL/functions/CLDepthConvert.cpp
index edcd492..b64d05b 100644
--- a/src/runtime/CL/functions/CLDepthConvert.cpp
+++ b/src/runtime/CL/functions/CLDepthConvert.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLDepthConvert::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
{
- auto k = arm_compute::cpp14::make_unique<CLDepthConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertKernel>();
k->configure(input, output, policy, shift);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp b/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
new file mode 100644
index 0000000..22c037f
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseConvolution3x3::CLDepthwiseConvolution3x3() // Default constructor; no tensors are bound until configure() is called.
+ : _kernel(), _border_handler() // value-initialize the convolution kernel and its border handler
+{
+}
+
+void CLDepthwiseConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+{ // Validates the tensors, then configures the convolution kernel and the border handler that runs before it.
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); // input is checked against (1 channel, F32)
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); // output is checked against (1 channel, F32)
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); // weights are checked against the input's data type
+ // The kernel is configured first because the border handler below is sized from _kernel.border_size().
+ _kernel.configure(input, output, weights, conv_info);
+ _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(0)); // constant border with value 0 on the input
+}
+
+void CLDepthwiseConvolution3x3::run()
+{ // Enqueues the two configured kernels on the CL scheduler, border handling first.
+ CLScheduler::get().enqueue(_border_handler); // fill the input border before the convolution reads it
+ CLScheduler::get().enqueue(_kernel); // then the convolution kernel itself
+}
+
+CLDepthwiseConvolution::CLDepthwiseConvolution() // Default constructor; kernels and intermediate tensors are only set up in configure().
+ : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(),
+ _v2mm_output() // every kernel and intermediate tensor member is value-initialized
+{
+}
+
+void CLDepthwiseConvolution::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+{ // Sets up the pipeline run() executes: im2col on the input, weights reshape, V2MM, then vector-to-tensor into output.
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2)); // input and weights must agree on dimension 2
+ // Kernel width, height and depth are taken from the first three dimensions of the weights tensor.
+ const size_t weights_w = weights->info()->dimension(0);
+ const size_t weights_h = weights->info()->dimension(1);
+ const size_t weights_z = weights->info()->dimension(2);
+ // conv_w / conv_h receive the pair returned by scaled_dimensions() for this input size, kernel size and conv_info.
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+
+ // Set up intermediate tensors
+ const size_t patch_size = weights_w * weights_h;
+ const size_t conv_size = conv_w * conv_h;
+ // im2col output: starts from the input shape, with dimensions 0..2 replaced by (patch_size, conv_size, weights_z).
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ // Reshaped weights are (patch_size, weights_z); the V2MM output keeps the output tensor's shape except dimensions 0..2, set to (conv_size * weights_z, 1, 1).
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ TensorShape shape_v2mm_out = output->info()->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ // Intermediates take data type and fixed point position from the input, except the reshaped weights, which take them from the weights.
+ const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+ const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ _input_reshaped.allocator()->init(info_im2col);
+ _weights_reshaped.allocator()->init(info_weights_reshape);
+ _v2mm_output.allocator()->init(info_v2mm_out);
+
+ // Configure kernels
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info);
+ _weights_reshape_kernel.configure(weights, &_weights_reshaped);
+ _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+ _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+ // Both V2MM operands get a constant border of value 0, sized from the V2MM kernel's border_size().
+ BorderSize border_size = _v2mm_kernel.border_size();
+ _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+ // The weights reuse the same border size, except that the bottom border is set to 0.
+ border_size.bottom = 0;
+ _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+
+ // Allocate intermediate tensors (only after every kernel above has been configured)
+ _input_reshaped.allocator()->allocate();
+ _weights_reshaped.allocator()->allocate();
+ _v2mm_output.allocator()->allocate();
+}
+
+void CLDepthwiseConvolution::run()
+{ // Enqueues the kernels set up in configure(), in the order shown below.
+ CLScheduler::get().enqueue(_im2col_kernel); // input -> _input_reshaped
+ // NOTE(review): the weights reshape is enqueued on every run(), not only on the first one.
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ // Fill the borders of both V2MM operands before the V2MM kernel runs.
+ CLScheduler::get().enqueue(_v2mm_input_fill_border);
+ CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+ CLScheduler::get().enqueue(_v2mm_kernel);
+ // Finally convert _v2mm_output into the output tensor.
+ CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+}
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
new file mode 100644
index 0000000..c325b3e
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLDepthwiseSeparableConvolutionLayer::CLDepthwiseSeparableConvolutionLayer() // Default constructor; both stages are set up later by configure().
+ : _depthwise_conv(), _pointwise_conv() // value-initialize the depthwise and pointwise sub-functions
+{
+}
+
+void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICLTensor *depthwise_weights, ICLTensor *depthwise_out, const ICLTensor *pointwise_weights, const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
+{ // Chains two stages through depthwise_out: the depthwise stage writes it, the pointwise stage reads it.
+ _depthwise_conv.configure(input, depthwise_out, depthwise_weights, depthwise_conv_info); // stage 1: input -> depthwise_out
+ _pointwise_conv.configure(depthwise_out, pointwise_weights, biases, output, pointwise_conv_info); // stage 2: depthwise_out -> output; biases are passed to this stage only
+}
+
+void CLDepthwiseSeparableConvolutionLayer::run()
+{ // Runs the two configured stages back to back.
+ _depthwise_conv.run(); // depthwise stage first
+ _pointwise_conv.run(); // then the pointwise stage
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
new file mode 100644
index 0000000..5559d42
--- /dev/null
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDequantizationLayer::CLDequantizationLayer() // Default constructor; the kernel is configured later by configure().
+ : _dequantize_kernel() // value-initialize the dequantization kernel
+{
+}
+
+void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
+{ // Thin wrapper: forwards all three tensors unchanged to the dequantization kernel; no validation happens here.
+ _dequantize_kernel.configure(input, output, min_max);
+}
+
+void CLDequantizationLayer::run()
+{
+ // Run dequantization kernel (second argument is false; presumably this skips the queue flush - confirm against CLScheduler::enqueue)
+ CLScheduler::get().enqueue(_dequantize_kernel, false);
+}
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
index c51cb4c..ae49996 100644
--- a/src/runtime/CL/functions/CLDerivative.cpp
+++ b/src/runtime/CL/functions/CLDerivative.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLDerivative.h"
#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLDerivativeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLDerivativeKernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
index 345f477..59c5ea5 100644
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ b/src/runtime/CL/functions/CLDilate.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLDilate.h"
#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLDilateKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLDilateKernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
new file mode 100644
index 0000000..6fafd9c
--- /dev/null
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDirectConvolutionLayer::CLDirectConvolutionLayer() // Default constructor; both kernels are configured later by configure().
+ : _direct_conv_kernel(), _input_border_handler() // value-initialize the convolution kernel and the input border handler
+{
+}
+
+void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ // Set GPU target (taken from the scheduler, and set before the kernel is configured)
+ _direct_conv_kernel.set_target(CLScheduler::get().target());
+
+ // Configure direct convolution (all arguments are forwarded unchanged to the kernel)
+ _direct_conv_kernel.configure(input, weights, biases, output, conv_info);
+
+ // Configure border handler: constant border of value 0 on the input, sized from the kernel's border_size()
+ _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void CLDirectConvolutionLayer::run()
+{
+ // Run border handler (second argument is false; presumably this skips the queue flush - confirm against CLScheduler::enqueue)
+ CLScheduler::get().enqueue(_input_border_handler, false);
+
+ // Run direct convolution (enqueued with the scheduler's default second argument)
+ CLScheduler::get().enqueue(_direct_conv_kernel);
+}
diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp
index b4c50e4..eb1f6e4 100644
--- a/src/runtime/CL/functions/CLErode.cpp
+++ b/src/runtime/CL/functions/CLErode.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLErode.h"
#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,8 +33,8 @@
void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLErodeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLErodeKernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index d2903fb..7a0dd09 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -36,8 +36,9 @@
using namespace arm_compute;
-CLFastCorners::CLFastCorners()
- : _fast_corners_kernel(),
+CLFastCorners::CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)),
+ _fast_corners_kernel(),
_suppr_func(),
_copy_array_kernel(),
_output(),
@@ -70,6 +71,7 @@
const bool update_number = (nullptr != _num_corners);
+ _memory_group.manage(&_output);
_fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, border_mode);
if(!_non_max)
@@ -79,6 +81,7 @@
else
{
_suppr.allocator()->init(tensor_info);
+ _memory_group.manage(&_suppr);
_suppr_func.configure(&_output, &_suppr, border_mode);
_copy_array_kernel.configure(&_suppr, update_number, corners, &_num_buffer);
@@ -94,6 +97,8 @@
{
cl::CommandQueue q = CLScheduler::get().queue();
+ _memory_group.acquire();
+
if(_non_max)
{
ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function");
@@ -124,4 +129,6 @@
}
q.flush();
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
index 9e59b77..54c096e 100644
--- a/src/runtime/CL/functions/CLFillBorder.cpp
+++ b/src/runtime/CL/functions/CLFillBorder.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLFillBorderKernel>();
- k->configure(tensor, border_width, border_mode, constant_border_value);
+ auto k = arm_compute::support::cpp14::make_unique<CLFillBorderKernel>();
+ k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
new file mode 100644
index 0000000..9f571b2
--- /dev/null
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/Size2D.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output)
+{ // Implements flatten with a CLIm2ColKernel, which becomes this function's kernel.
+ auto k = arm_compute::support::cpp14::make_unique<CLIm2ColKernel>();
+ k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false); // Size2D(1, 1) with PadStrideInfo(1, 1, 0, 0); the trailing false is presumably the has_bias flag - confirm against CLIm2ColKernel::configure
+ _kernel = std::move(k); // hand ownership of the configured kernel to the _kernel member
+}
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
new file mode 100644
index 0000000..364db34
--- /dev/null
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFloor.h"
+
+#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLFloor::configure(const ICLTensor *input, ICLTensor *output)
+{ // Creates a CLFloorKernel, configures it with the given tensors and stores it as this function's kernel.
+ auto k = arm_compute::support::cpp14::make_unique<CLFloorKernel>();
+ k->configure(input, output); // input and output are forwarded unchanged
+ _kernel = std::move(k); // hand ownership of the configured kernel to the _kernel member
+}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 57d57d5..ee1558f 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -23,88 +23,31 @@
*/
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
#include <algorithm>
-#include <cmath>
using namespace arm_compute;
-CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
- : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
+ _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
{
}
-void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
+void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
- ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
- ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- _transpose_weights = transpose_weights;
- _is_batched_fc_layer = is_batched_fc_layer;
-
- // Check if we need to transpose the weights
- if(_transpose_weights)
- {
- if(_is_batched_fc_layer)
- {
- // Initialize the output tensor for transpose
- TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
- _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
- _transpose_kernel.configure(input, &_transpose_output);
-
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(&_transpose_output, output);
-
- // Allocate temporary tensor used for transposing the weights
- _transpose_output.allocator()->allocate();
- }
- else
- {
- _transpose_kernel.configure(input, output);
- }
- }
- else
- {
- if(_is_batched_fc_layer)
- {
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(input, output);
- }
- else
- {
- ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
- }
- }
-}
-
-void CLFullyConnectedLayerReshapeWeights::run()
-{
- if(_transpose_weights)
- {
- CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
- }
- if(_is_batched_fc_layer)
- {
- CLScheduler::get().enqueue(_transpose1xW_kernel);
- }
-}
-
-CLFullyConnectedLayer::CLFullyConnectedLayer()
- : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
- _are_weights_reshaped(true), _is_fc_after_conv(true), _is_batched_fc_layer(false), _accumulate_biases(false)
-{
-}
-
-void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+ ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
const DataType dt = input->info()->data_type();
const int fixed_point_position = input->info()->fixed_point_position();
@@ -119,93 +62,33 @@
shape_im2col.set(3, input->info()->dimension(5));
_im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
- // Initialize output tensor for interleave 4x4
- TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
// Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure interleave4x4 kernel
- _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
+ _memory_group.manage(&_im2col_output);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
// Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
- // Allocate the tensors once all the configure methods have been called
- _im2col_output.allocator()->allocate();
- _interleave4x4_output.allocator()->allocate();
-}
-
-void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // Initialize output tensor for interleave 4x4
- TensorShape shape_interleaved = input->info()->tensor_shape();
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
-
- // Configure interleave4x4 kernel
- _interleave4x4_kernel.configure(input, &_interleave4x4_output);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
-
- // Allocate the tensors once all the configure methods have been called
- _interleave4x4_output.allocator()->allocate();
-}
-
-void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for im2col
- TensorShape shape_im2col;
- shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
- shape_im2col.set(1, 1);
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
-
- // Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
+ _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
// Allocate the output tensor for im2col once all the configure methods have been called
_im2col_output.allocator()->allocate();
}
-void CLFullyConnectedLayer::configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
// Configure matrix multiply kernel
- _mm_kernel.configure(input, weights, output, 1.0f);
+ _mm_kernel.configure(input, weights, output, 1.0f, false);
}
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
-
- _are_weights_reshaped = are_weights_reshaped;
+ _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
_is_fc_after_conv = true;
- _is_batched_fc_layer = false;
_accumulate_biases = false;
if(biases != nullptr)
@@ -224,90 +107,46 @@
// 3) Convolution layer -> Fully Connected layer with batches
// 4) Fully Connected layer -> Fully Connected layer with batches
- // Check if we have a fully connected layer with batches
- _is_batched_fc_layer = (output->info()->dimension(1) > 1);
-
const ICLTensor *weights_to_use = weights;
- if(!are_weights_reshaped)
+ if(!_are_weights_reshaped)
{
- if((transpose_weights || _is_batched_fc_layer))
- {
- weights_to_use = &_reshape_weights_output;
+ weights_to_use = &_reshape_weights_output;
- if(transpose_weights)
- {
- if(_is_batched_fc_layer)
- {
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
- else
- {
- TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
-
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
- TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
- _reshape_weights_output.allocator()->init(info_wt);
- }
-
- // Reshape the weights
- _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
- }
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
}
- if(_is_batched_fc_layer)
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+
+ if(is_batched_fc_layer)
{
_is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
input->info()->tensor_shape().cend(),
output->info()->tensor_shape().cbegin() + 1));
-
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer with batches
- configure_conv_fc_wb(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer with batches
- configure_fc_fc_wb(input, weights_to_use, output);
- }
}
else
{
- // In case of not batched fully connected layer, the weights will not be reshaped using transposed1xW
- _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+ _is_fc_after_conv = input->info()->num_dimensions() > 1;
+ }
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc_nb(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc_nb(input, weights_to_use, output);
- }
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer (with or without batches)
+ configure_conv_fc(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer (with or without batches)
+ configure_fc_fc(input, weights_to_use, output);
}
// Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
- if(!are_weights_reshaped)
+ if(!_are_weights_reshaped)
{
- if(transpose_weights || _is_batched_fc_layer)
- {
- // Allocate the tensor for the weights reshaped
- _reshape_weights_output.allocator()->allocate();
- }
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
}
}
@@ -320,18 +159,14 @@
_reshape_weights_kernel.run();
}
+ _memory_group.acquire();
+
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
{
CLScheduler::get().enqueue(_im2col_kernel, false);
}
- // Interleave input
- if(_is_batched_fc_layer)
- {
- CLScheduler::get().enqueue(_interleave4x4_kernel, false);
- }
-
// Run matrix multiply
CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
@@ -340,4 +175,6 @@
{
CLScheduler::get().enqueue(_accumulate_biases_kernel);
}
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 7408054..a81d113 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,20 +38,18 @@
using namespace arm_compute;
-CLGEMM::CLGEMM()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
{
}
void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
if(c != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
@@ -59,13 +57,18 @@
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
}
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
- if(a->info()->dimension(1) != 1)
+ // If the input tensor has 16 rows or fewer, we run a special version of GEMM without reshaping the input tensors
+ _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
+ const ICLTensor *matrix_a = a;
+ const ICLTensor *matrix_b = b;
+
+ if(_is_interleaved_transposed)
{
- _run_vector_matrix_multiplication = false;
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
TensorShape shape_tmp_a = a->info()->tensor_shape();
TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -73,27 +76,20 @@
shape_tmp_a.set(0, a->info()->dimension(0) * 4);
shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
- if(DataType::F32 == a->info()->data_type())
- {
- shape_tmp_b.set(0, b->info()->dimension(1) * 4);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
- }
- else if(DataType::F16 == a->info()->data_type())
- {
- shape_tmp_b.set(0, b->info()->dimension(1) * 8);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
- }
- else
- {
- ARM_COMPUTE_ERROR("DataType not supported");
- }
+ const unsigned int transpose_w = max_cl_vector_width / data_size_from_type(b->info()->data_type());
+ shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
_tmp_a.allocator()->init(info_a);
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
_tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
// Configure interleave kernel
_interleave_kernel.configure(a, &_tmp_a);
@@ -101,19 +97,17 @@
_transpose_kernel.configure(b, &_tmp_b);
// Configure matrix multiply kernel
- _mm_kernel.configure(&_tmp_a, &_tmp_b, output, alpha);
+ _mm_kernel.set_target(CLScheduler::get().target());
+ }
+ _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+
+ if(_is_interleaved_transposed)
+ {
// Allocate intermediate tensors
_tmp_a.allocator()->allocate();
_tmp_b.allocator()->allocate();
}
- else // The first input tensor is a vector
- {
- _run_vector_matrix_multiplication = true;
-
- // Configure the matrix multiply kernel
- _mm_kernel.configure(a, b, output, alpha);
- }
// Configure matrix addition kernel
if(beta != 0 && c != nullptr)
@@ -125,7 +119,9 @@
void CLGEMM::run()
{
- if(!_run_vector_matrix_multiplication)
+ _memory_group.acquire();
+
+ if(_is_interleaved_transposed)
{
// Run interleave kernel
CLScheduler::get().enqueue(_interleave_kernel, false);
@@ -142,4 +138,6 @@
{
CLScheduler::get().enqueue(_ma_kernel);
}
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
index 9dc7715..45547e4 100644
--- a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
+++ b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
@@ -24,13 +24,13 @@
#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void CLGEMMInterleave4x4::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
index 45e011d..db6d11c 100644
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowp.cpp
@@ -33,8 +33,8 @@
using namespace arm_compute;
-CLGEMMLowp::CLGEMMLowp()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
{
}
@@ -62,6 +62,10 @@
TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
_tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
// Configure kernels
_interleave_kernel.configure(a, &_tmp_a);
_transpose_kernel.configure(b, &_tmp_b);
@@ -74,6 +78,8 @@
void CLGEMMLowp::run()
{
+ _memory_group.acquire();
+
/* Run interleave kernel */
CLScheduler::get().enqueue(_interleave_kernel, false);
@@ -82,4 +88,6 @@
/* Run matrix multiply kernel */
CLScheduler::get().enqueue(_mm_kernel, false);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp b/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
new file mode 100644
index 0000000..d054e01
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLGEMMTranspose1xW::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMTranspose1xWKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
index 362a3fe..7ebabd7 100644
--- a/src/runtime/CL/functions/CLGaussian3x3.cpp
+++ b/src/runtime/CL/functions/CLGaussian3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLGaussian3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLGaussian3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index e83a8fb..f30eee1 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -35,8 +35,8 @@
using namespace arm_compute;
-CLGaussian5x5::CLGaussian5x5()
- : _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
+CLGaussian5x5::CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
{
}
@@ -46,6 +46,10 @@
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16));
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
+ // Configure kernels
_kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
_kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
_border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
@@ -57,6 +61,11 @@
void CLGaussian5x5::run()
{
CLScheduler::get().enqueue(_border_handler, false);
+
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 8a4279e..8436dce 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -27,11 +27,11 @@
#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
#include "arm_compute/runtime/CL/CLPyramid.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -48,8 +48,10 @@
{
}
-CLGaussianPyramidHalf::CLGaussianPyramidHalf()
- : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
+ : _border_handler(),
+ _horizontal_reduction(),
+ _vertical_reduction()
{
}
@@ -70,9 +72,9 @@
if(num_levels > 1)
{
- _border_handler = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+ _border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -119,8 +121,9 @@
}
}
-CLGaussianPyramidOrb::CLGaussianPyramidOrb()
- : _gauss5x5(), _scale_nearest()
+CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT
+ : _gauss5x5(),
+ _scale_nearest()
{
}
@@ -141,8 +144,8 @@
if(num_levels > 1)
{
- _gauss5x5 = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
- _scale_nearest = arm_compute::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
+ _gauss5x5 = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
+ _scale_nearest = arm_compute::support::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index b1b5a03..1470d5c 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -31,8 +31,8 @@
using namespace arm_compute;
-CLHOGDescriptor::CLHOGDescriptor()
- : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
{
}
@@ -71,9 +71,16 @@
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space.allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+ // Manage intermediate buffers
+ _memory_group.manage(&_hog_space);
+
// Initialise orientation binning kernel
_orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
@@ -88,6 +95,8 @@
void CLHOGDescriptor::run()
{
+ _memory_group.acquire();
+
// Run gradient
_gradient.run();
@@ -96,4 +105,6 @@
// Run block normalization
CLScheduler::get().enqueue(_block_norm);
+
+ _memory_group.release();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 2387474..51aeaed 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -29,8 +29,8 @@
using namespace arm_compute;
-CLHOGGradient::CLHOGGradient()
- : _derivative(), _mag_phase(), _gx(), _gy()
+CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy()
{
}
@@ -47,6 +47,10 @@
_gx.allocator()->init(info);
_gy.allocator()->init(info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Initialise derivate kernel
_derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
@@ -67,9 +71,13 @@
void CLHOGGradient::run()
{
+ _memory_group.acquire();
+
// Run derivative
_derivative.run();
// Run magnitude/phase kernel
CLScheduler::get().enqueue(_mag_phase);
+
+ _memory_group.release();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index b8f2224..8012c2f 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -25,17 +25,31 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Scheduler.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLHOGMultiDetection::CLHOGMultiDetection()
- : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
- _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _gradient_kernel(),
+ _orient_bin_kernel(),
+ _block_norm_kernel(),
+ _hog_detect_kernel(),
+ _non_maxima_kernel(),
+ _hog_space(),
+ _hog_norm_space(),
+ _detection_windows(),
+ _mag(),
+ _phase(),
+ _non_maxima_suppression(false),
+ _num_orient_bin_kernel(0),
+ _num_block_norm_kernel(0),
+ _num_hog_detect_kernel(0)
{
}
@@ -114,12 +128,12 @@
_num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
_num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
- _orient_bin_kernel = arm_compute::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
- _block_norm_kernel = arm_compute::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
- _hog_detect_kernel = arm_compute::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
- _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
- _hog_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
- _hog_norm_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+ _orient_bin_kernel = arm_compute::support::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::support::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::support::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
// Allocate tensors for magnitude and phase
TensorInfo info_mag(shape_img, Format::S16);
@@ -128,6 +142,10 @@
TensorInfo info_phase(shape_img, Format::U8);
_phase.allocator()->init(info_phase);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
@@ -153,10 +171,17 @@
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space[i].allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_space.get() + i);
+
// Initialise orientation binning kernel
_orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
// Configure CLTensor for the normalized HOG space and block normalization kernel
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
@@ -167,10 +192,19 @@
TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
_hog_norm_space[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_norm_space.get() + i);
+
// Initialize block normalization kernel
_block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
detection_window_strides->map(CLScheduler::get().queue(), true);
// Configure HOG detector kernel
@@ -187,14 +221,6 @@
_non_maxima_kernel->configure(_detection_windows, min_distance);
// Allocate intermediate tensors
- _mag.allocator()->allocate();
- _phase.allocator()->allocate();
-
- for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
- {
- _hog_space[i].allocator()->allocate();
- }
-
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
_hog_norm_space[i].allocator()->allocate();
@@ -205,6 +231,8 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Reset detection window
_detection_windows->clear();
@@ -234,7 +262,9 @@
{
// Map detection windows array before computing non maxima suppression
_detection_windows->map(CLScheduler::get().queue(), true);
- _non_maxima_kernel->run(_non_maxima_kernel->window());
+ Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
_detection_windows->unmap(CLScheduler::get().queue());
}
-}
\ No newline at end of file
+
+ _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 2db277f..059528f 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -27,7 +27,6 @@
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -36,14 +35,28 @@
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
#include "arm_compute/runtime/ITensorAllocator.h"
#include "arm_compute/runtime/Scheduler.h"
+#include "support/ToolchainSupport.h"
#include <cmath>
#include <utility>
using namespace arm_compute;
-CLHarrisCorners::CLHarrisCorners()
- : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0),
+CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _sobel(nullptr),
+ _harris_score(),
+ _non_max_suppr(),
+ _candidates(),
+ _sort_euclidean(),
+ _border_gx(),
+ _border_gy(),
+ _gx(),
+ _gy(),
+ _score(),
+ _nonmax(),
+ _corners_list(nullptr),
+ _num_corner_candidates(0),
_corners(nullptr)
{
}
@@ -62,6 +75,7 @@
const TensorShape shape = input->info()->tensor_shape();
const DataType dt = (gradient_size < 7) ? DataType::S16 : DataType::S32;
TensorInfo tensor_info(shape, 1, dt);
+
_gx.allocator()->init(tensor_info);
_gy.allocator()->init(tensor_info);
@@ -69,28 +83,32 @@
_score.allocator()->init(info_f32);
_nonmax.allocator()->init(info_f32);
- _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+ _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
/* Set/init Sobel kernel accordingly with gradient_size */
switch(gradient_size)
{
case 3:
{
- auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 5:
{
- auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 7:
{
- auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
@@ -99,37 +117,49 @@
ARM_COMPUTE_ERROR("Gradient size not implemented");
}
- // Configure border filling before harris score
- _border_gx.configure(&_gx, block_size / 2, border_mode, constant_border_value);
- _border_gy.configure(&_gy, block_size / 2, border_mode, constant_border_value);
-
// Normalization factor
const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
const float pow4_normalization_factor = pow(norm_factor, 4);
+ // Manage intermediate buffers
+ _memory_group.manage(&_score);
+
// Set/init Harris Score kernel accordingly with block_size
_harris_score.configure(&_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
- // Init non-maxima suppression function
- _non_max_suppr.configure(&_score, &_nonmax, border_mode == BorderMode::UNDEFINED);
-
- // Init corner candidates kernel
- _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
-
- // Init euclidean distance
- _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
+ // Configure border filling using harris score kernel's block size
+ _border_gx.configure(&_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_gy.configure(&_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
// Allocate intermediate buffers
_gx.allocator()->allocate();
_gy.allocator()->allocate();
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_nonmax);
+
+ // Init non-maxima suppression function
+ _non_max_suppr.configure(&_score, &_nonmax, border_mode);
+
+ // Allocate intermediate buffers
_score.allocator()->allocate();
+
+ // Init corner candidates kernel
+ _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+
+ // Allocate intermediate buffers
_nonmax.allocator()->allocate();
+
+ // Init euclidean distance
+ _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
}
void CLHarrisCorners::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Init to 0 number of corner candidates
_num_corner_candidates = 0;
@@ -144,7 +174,7 @@
CLScheduler::get().enqueue(_harris_score, false);
// Run non-maxima suppression
- CLScheduler::get().enqueue(_non_max_suppr);
+ _non_max_suppr.run();
// Run corner candidate kernel
_nonmax.map(true);
@@ -152,6 +182,8 @@
_nonmax.unmap();
_corners->map(CLScheduler::get().queue(), true);
- _sort_euclidean.run(_sort_euclidean.window());
+ Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
_corners->unmap(CLScheduler::get().queue());
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2Normalize.cpp
new file mode 100644
index 0000000..99be8ca
--- /dev/null
+++ b/src/runtime/CL/functions/CLL2Normalize.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLL2Normalize.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLL2Normalize::CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
+{
+}
+
+void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
+{
+ // Manage intermediate buffers
+ _memory_group.manage(&_sumsq);
+
+ // Configure kernels
+ _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+ _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+
+ // Allocate intermediate tensor
+ _sumsq.allocator()->allocate();
+}
+
+void CLL2Normalize::run()
+{
+ _memory_group.acquire();
+
+ _reduce_func.run();
+ CLScheduler::get().enqueue(_normalize_kernel, true);
+
+ _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index d7ce206..a395487 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IPyramid.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
@@ -33,11 +32,18 @@
#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLLaplacianPyramid::CLLaplacianPyramid()
- : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _depth_function(), _gauss_pyr(), _conv_pyr()
+CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT
+ : _num_levels(0),
+ _gaussian_pyr_function(),
+ _convf(),
+ _subf(),
+ _depth_function(),
+ _gauss_pyr(),
+ _conv_pyr()
{
}
@@ -64,8 +70,8 @@
// Create Gaussian Pyramid function
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
- _convf = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
- _subf = arm_compute::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
+ _convf = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
+ _subf = arm_compute::support::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
for(unsigned int i = 0; i < _num_levels; ++i)
{
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index 1dfab74..678848b 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -24,18 +24,21 @@
#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IPyramid.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
using namespace arm_compute;
-CLLaplacianReconstruct::CLLaplacianReconstruct()
- : _tmp_pyr(), _addf(), _scalef(), _depthf()
+CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT
+ : _tmp_pyr(),
+ _addf(),
+ _scalef(),
+ _depthf()
{
}
@@ -60,8 +63,8 @@
_tmp_pyr.init(pyramid_info);
// Allocate add and scale functions. Level 0 does not need to be scaled.
- _addf = arm_compute::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
- _scalef = arm_compute::cpp14::make_unique<CLScale[]>(num_levels - 1);
+ _addf = arm_compute::support::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
+ _scalef = arm_compute::support::cpp14::make_unique<CLScale[]>(num_levels - 1);
const size_t last_level = num_levels - 1;
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 263fb51..a89a45a 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
using namespace arm_compute;
-CLLocallyConnectedLayer::CLLocallyConnectedLayer()
- : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+ _is_first_run(false)
{
}
@@ -68,8 +69,8 @@
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
- stride_x, stride_y, pad_x, pad_y, conv_info.round());
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ conv_info);
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
@@ -99,8 +100,12 @@
shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+ // Manage intermediate buffers
+ _memory_group.manage(&_input_im2col_reshaped);
+ _memory_group.manage(&_gemm_output);
+
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -120,6 +125,8 @@
CLScheduler::get().enqueue(_weights_reshape_kernel);
}
+ _memory_group.acquire();
+
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
@@ -128,4 +135,6 @@
// Reshape output matrix
CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index 51088cb..68b8c35 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
{
- auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
k->configure(input1, input2, output, nullptr, mag_type);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 56ba146..838f7e7 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -23,19 +23,19 @@
*/
#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
-#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
CLMeanStdDev::CLMeanStdDev()
: _mean_stddev_kernel(),
+ _fill_border_kernel(),
_global_sum(),
_global_sum_squared()
{
}
-void CLMeanStdDev::configure(const ICLImage *input, float *mean, float *stddev)
+void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
{
_global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
@@ -45,9 +45,11 @@
}
_mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+ _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
}
void CLMeanStdDev::run()
{
+ CLScheduler::get().enqueue(_fill_border_kernel);
CLScheduler::get().enqueue(_mean_stddev_kernel);
}
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
index 0c10f9a..55f9eaa 100644
--- a/src/runtime/CL/functions/CLMedian3x3.cpp
+++ b/src/runtime/CL/functions/CLMedian3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLMedian3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLMedian3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
index ad783d8..49dcbcb 100644
--- a/src/runtime/CL/functions/CLMinMaxLocation.cpp
+++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp
@@ -25,8 +25,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLMinMaxLocation::CLMinMaxLocation()
: _min_max_kernel(),
_min_max_loc_kernel(),
@@ -41,7 +41,7 @@
{
}
-void CLMinMaxLocation::configure(const ICLImage *input, int32_t *min, int32_t *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
{
ARM_COMPUTE_ERROR_ON(nullptr == min);
ARM_COMPUTE_ERROR_ON(nullptr == max);
@@ -67,8 +67,8 @@
CLScheduler::get().enqueue(_min_max_loc_kernel, false);
// Update min and max
- q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), _min);
- q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), _max);
+ q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_min));
+ q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_max));
// Update min and max count
if(_min_count != nullptr)
@@ -96,3 +96,4 @@
_max_loc->resize(max_corner_size);
}
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
index b593a6c..d37412f 100644
--- a/src/runtime/CL/functions/CLNonLinearFilter.cpp
+++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLNonLinearFilterKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLNonLinearFilterKernel>();
k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
index ca7d5ae..c0a0cef 100644
--- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
+++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode)
{
- auto k = arm_compute::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
k->configure(input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 2d89ebd..f4bd494 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -33,28 +33,26 @@
using namespace arm_compute;
CLNormalizationLayer::CLNormalizationLayer()
- : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
+ : _norm_kernel(), _border_handler()
{
}
-void CLNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON(input == nullptr);
- _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
+ // Configure normalization kernel
+ _norm_kernel.configure(input, output, norm_info);
- _norm_kernel.configure(input, &_squared_input, output, norm_info);
- _multiply_kernel.configure(input, input, &_squared_input, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
-
- // Allocate intermediate buffers
- _squared_input.allocator()->allocate();
+ _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
}
void CLNormalizationLayer::run()
{
- CLScheduler::get().enqueue(_multiply_kernel, false);
+ // Run border handler
CLScheduler::get().enqueue(_border_handler, false);
- CLScheduler::get().enqueue(_norm_kernel, false);
+
+ // Run normalization kernel
+ CLScheduler::get().enqueue(_norm_kernel);
}
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index a6b0eb3..d00b1b5 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -26,7 +26,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/CL/CLPyramid.h"
@@ -34,12 +33,27 @@
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-CLOpticalFlow::CLOpticalFlow()
- : _tracker_init_kernel(), _tracker_stage0_kernel(), _tracker_stage1_kernel(), _tracker_finalize_kernel(), _func_scharr(), _scharr_gx(), _scharr_gy(), _old_points(nullptr),
- _new_points_estimates(nullptr), _new_points(nullptr), _old_points_internal(), _new_points_internal(), _coefficient_table(), _old_values(), _num_levels(0)
+CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _tracker_init_kernel(),
+ _tracker_stage0_kernel(),
+ _tracker_stage1_kernel(),
+ _tracker_finalize_kernel(),
+ _func_scharr(),
+ _scharr_gx(),
+ _scharr_gy(),
+ _old_points(nullptr),
+ _new_points_estimates(nullptr),
+ _new_points(nullptr),
+ _old_points_internal(),
+ _new_points_internal(),
+ _coefficient_table(),
+ _old_values(),
+ _num_levels(0)
{
}
@@ -70,21 +84,21 @@
const int old_values_list_length = list_length * window_dimension * window_dimension;
// Create kernels and tensors
- _tracker_init_kernel = arm_compute::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
- _tracker_stage0_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
- _tracker_stage1_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
- _func_scharr = arm_compute::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
- _scharr_gx = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
- _scharr_gy = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
+ _tracker_init_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
+ _tracker_stage0_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
+ _tracker_stage1_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
+ _func_scharr = arm_compute::support::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
+ _scharr_gx = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
+ _scharr_gy = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
// Create internal keypoint arrays
- _old_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+ _old_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
_old_points_internal->resize(list_length);
- _new_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+ _new_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
_new_points_internal->resize(list_length);
- _coefficient_table = arm_compute::cpp14::make_unique<CLCoefficientTableArray>(list_length);
+ _coefficient_table = arm_compute::support::cpp14::make_unique<CLCoefficientTableArray>(list_length);
_coefficient_table->resize(list_length);
- _old_values = arm_compute::cpp14::make_unique<CLOldValueArray>(old_values_list_length);
+ _old_values = arm_compute::support::cpp14::make_unique<CLOldValueArray>(old_values_list_length);
_old_values->resize(old_values_list_length);
_new_points->resize(list_length);
@@ -103,6 +117,10 @@
_scharr_gx[i].allocator()->init(tensor_info);
_scharr_gy[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_scharr_gx.get() + i);
+ _memory_group.manage(_scharr_gy.get() + i);
+
// Init Scharr kernel
_func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
@@ -131,6 +149,8 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
+ _memory_group.acquire();
+
for(unsigned int level = _num_levels; level > 0; --level)
{
// Run Scharr kernel
@@ -147,4 +167,6 @@
}
CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
index a8cb22b..cf3fa7e 100644
--- a/src/runtime/CL/functions/CLPhase.cpp
+++ b/src/runtime/CL/functions/CLPhase.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLPhase.h"
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
{
- auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
k->configure(input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 8a86c2e..139d466 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLPixelWiseMultiplication::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
- auto k = arm_compute::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 1ef70f4..2cb7d63 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -24,14 +24,14 @@
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
{
// Configure pooling kernel
- auto k = arm_compute::cpp14::make_unique<CLPoolingLayerKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>();
k->configure(input, output, pool_info);
_kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
new file mode 100644
index 0000000..ed1f51c
--- /dev/null
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLQuantizationLayer::CLQuantizationLayer()
+ : _quantize_kernel(), _min_max_kernel(), _min_max()
+{
+}
+
+void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
+ _min_max_kernel.configure(input, &_min_max);
+
+ // Configure quantize kernel
+ _quantize_kernel.configure(input, output, &_min_max);
+
+ // Allocate min_max tensor
+ _min_max.allocator()->allocate();
+}
+
+void CLQuantizationLayer::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // Reset min and max
+ _min_max_kernel.reset(q);
+
+ // Run min-max kernel
+ CLScheduler::get().enqueue(_min_max_kernel, false);
+
+ // Run quantize kernel
+ CLScheduler::get().enqueue(_quantize_kernel, false);
+}
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
new file mode 100644
index 0000000..0f480ee
--- /dev/null
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+
+#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ // Configure ROI pooling kernel
+ auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
+ k->configure(input, rois, output, pool_info);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
new file mode 100644
index 0000000..d02afb4
--- /dev/null
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager) // memory_manager is moved into _memory_group
+ : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+{ // every member other than _memory_group is value-initialised here and only receives its real value in configure()
+}
+
+void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+{
+ // Number of work-groups (WGs) needed along dimension 0: 16 elements per thread * 8 threads per WG = 128 elements per WG
+ unsigned int num_of_wg = ceil(input->info()->dimension(0) / 128.f);
+
+ // Calculate number of stages. The first stage performs op; every later stage performs ReductionOperation::SUM
+ // on the previous stage's result. Integer division, so always at least 2. The last stage should have only 1 WG.
+ _num_of_stages = num_of_wg / 128 + 2;
+
+ // Create temporary tensors: one per stage except the last, which writes straight to output
+ _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+
+ // Allocate one reduction kernel and one border handler per stage
+ _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
+ _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
+
+ TensorShape shape{ input->info()->tensor_shape() };
+ for(unsigned int i = 0; i < _num_of_stages - 1; i++) // NOTE(review): only dimension 0 is ever shrunk, whatever axis is -- confirm other axes are unsupported
+ {
+ shape.set(0, ceil(shape.x() / 128.f)); // each temporary is 128x smaller along dimension 0 (rounded up) than the tensor feeding it
+ _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ }
+
+ // First stage: input -> _sums_vector[0], using the caller's op. The temporary is registered with the memory group first
+ _memory_group.manage(_sums_vector.get());
+ _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, op);
+ _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ // Intermediate stages: _sums_vector[i - 1] -> _sums_vector[i], always ReductionOperation::SUM
+ for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
+ {
+ _memory_group.manage(_sums_vector.get() + i);
+ _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
+ _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _sums_vector[i - 1].allocator()->allocate(); // allocate once both the producer (stage i - 1) and the consumer (stage i) of this temporary are configured
+ }
+
+ // Last stage: _sums_vector[last_stage - 1] -> output, always ReductionOperation::SUM
+ const unsigned int last_stage = _num_of_stages - 1;
+ _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, ReductionOperation::SUM);
+ _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _sums_vector[last_stage - 1].allocator()->allocate(); // the final temporary now has both its producer and its consumer configured
+}
+
+void CLReductionOperation::run()
+{
+ _memory_group.acquire(); // paired with release() below; configure() registered the temporaries with this group via manage()
+
+ for(unsigned int i = 0; i < _num_of_stages; ++i) // stages are enqueued in order, each stage's border handler before its reduction kernel
+ {
+ CLScheduler::get().enqueue(_border_handlers_vector[i], false); // NOTE(review): false presumably means "do not flush the queue" -- confirm against CLScheduler::enqueue
+ CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+ }
+
+ _memory_group.release();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
index f6b1713..bc3fd4e 100644
--- a/src/runtime/CL/functions/CLRemap.cpp
+++ b/src/runtime/CL/functions/CLRemap.cpp
@@ -26,10 +26,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -43,7 +43,7 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
- auto k = arm_compute::cpp14::make_unique<CLRemapKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLRemapKernel>();
k->configure(input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
new file mode 100644
index 0000000..2ce83dc
--- /dev/null
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLReshapeLayerKernel>(); // create the reshape kernel
+ k->configure(input, output); // both arguments are forwarded unchanged
+ _kernel = std::move(k); // the configured kernel is handed over to _kernel; k must not be used afterwards
+}
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 043f873..49b0275 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -26,19 +26,14 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
using namespace arm_compute;
-void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
{
- ARM_COMPUTE_ERROR_ON(output == input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- auto k = arm_compute::cpp14::make_unique<CLScaleKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp
index c8bc465..73f8673 100644
--- a/src/runtime/CL/functions/CLScharr3x3.cpp
+++ b/src/runtime/CL/functions/CLScharr3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLScharr3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLScharr3x3Kernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp
index 6b74eba..e227e58 100644
--- a/src/runtime/CL/functions/CLSobel3x3.cpp
+++ b/src/runtime/CL/functions/CLSobel3x3.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLSobel3x3Kernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3Kernel>();
k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
index 098b546..d4bc855 100644
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -33,8 +33,8 @@
using namespace arm_compute;
-CLSobel5x5::CLSobel5x5()
- : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
{
}
@@ -51,6 +51,8 @@
{
_tmp_x.allocator()->init(tensor_info);
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -59,6 +61,7 @@
else if(run_sobel_x)
{
_tmp_x.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
_sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -66,6 +69,7 @@
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
@@ -76,6 +80,11 @@
void CLSobel5x5::run()
{
CLScheduler::get().enqueue(_border_handler, false);
+
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_sobel_hor, false);
CLScheduler::get().enqueue(_sobel_vert);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
index db84fa9..6083090 100644
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -33,8 +33,8 @@
using namespace arm_compute;
-CLSobel7x7::CLSobel7x7()
- : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
{
}
@@ -51,6 +51,8 @@
{
_tmp_x.allocator()->init(tensor_info);
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -59,6 +61,7 @@
else if(run_sobel_x)
{
_tmp_x.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_x);
_sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
@@ -66,6 +69,7 @@
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
+ _memory_group.manage(&_tmp_y);
_sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
_sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
@@ -76,6 +80,11 @@
void CLSobel7x7::run()
{
CLScheduler::get().enqueue(_border_handler, false);
+
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_sobel_hor, false);
CLScheduler::get().enqueue(_sobel_vert);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 2a78c58..7505a2c 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -25,29 +25,34 @@
#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
-CLSoftmaxLayer::CLSoftmaxLayer()
- : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
{
}
void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
// Create intermediate tensors shapes
- _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
TensorShape shape = input->info()->tensor_shape();
shape.set(0, 1);
- TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
+ TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
_max.allocator()->init(tensor_info_max_sum);
_sum.allocator()->init(tensor_info_max_sum);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+ _memory_group.manage(&_max);
+ _memory_group.manage(&_sum);
+
// Configure Kernels
_max_kernel.configure(input, &_max);
_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
@@ -61,7 +66,11 @@
void CLSoftmaxLayer::run()
{
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_max_kernel, false);
CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
CLScheduler::get().enqueue(_norm_kernel);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp
index 743ed5e..d187650 100644
--- a/src/runtime/CL/functions/CLTableLookup.cpp
+++ b/src/runtime/CL/functions/CLTableLookup.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLTableLookupKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLTableLookupKernel>();
k->configure(input, lut, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp
index e70f932..1b30b77 100644
--- a/src/runtime/CL/functions/CLThreshold.cpp
+++ b/src/runtime/CL/functions/CLThreshold.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLThreshold.h"
#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
{
- auto k = arm_compute::cpp14::make_unique<CLThresholdKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLThresholdKernel>();
k->configure(input, output, threshold, false_value, true_value, type, upper);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index d802b4f..cd19e25 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -32,7 +32,7 @@
void CLTranspose::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::cpp14::make_unique<CLTransposeKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
index 537e0d9..f785c75 100644
--- a/src/runtime/CL/functions/CLWarpAffine.cpp
+++ b/src/runtime/CL/functions/CLWarpAffine.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLWarpAffineKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLWarpAffineKernel>();
k->configure(input, output, matrix, policy);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
index a552ab4..b445b3b 100644
--- a/src/runtime/CL/functions/CLWarpPerspective.cpp
+++ b/src/runtime/CL/functions/CLWarpPerspective.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
+#include "support/ToolchainSupport.h"
#include <utility>
@@ -33,7 +33,7 @@
void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
- auto k = arm_compute::cpp14::make_unique<CLWarpPerspectiveKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<CLWarpPerspectiveKernel>();
k->configure(input, output, matrix, policy);
_kernel = std::move(k);
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));