arm_compute v19.05
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 84e8709..4c7458d 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -177,7 +177,7 @@
void CLCannyEdge::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run sobel
_sobel->run();
@@ -199,6 +199,4 @@
_l1_list_counter.clear(CLScheduler::get().queue());
_l1_stack.clear(CLScheduler::get().queue());
CLScheduler::get().enqueue(_edge_trace, true);
-
- _memory_group.release();
}
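A note on the pattern introduced above: MemoryGroupResourceScope is an RAII helper that acquires the memory group's resources on construction and releases them when the scope ends, which is what makes the explicit acquire()/release() pair redundant. A minimal sketch of the idiom (the function and kernel names are illustrative, not part of this patch):

void SomeCLFunction::run()
{
    MemoryGroupResourceScope scope_mg(_memory_group); // acquires the group's intermediate buffers

    CLScheduler::get().enqueue(_kernel_a, false);
    CLScheduler::get().enqueue(_kernel_b, true);
} // scope_mg goes out of scope here and releases the memory group, even on early return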
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 018c674..b8224d2 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,9 @@
*/
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
+#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
@@ -35,56 +38,168 @@
namespace arm_compute
{
CLConcatenateLayer::CLConcatenateLayer()
- : _concat_function(nullptr)
+ : _concat_kernels(),
+ _num_inputs(0),
+ _axis(Window::DimX)
{
}
-void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, DataLayoutDimension axis)
+void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
{
ARM_COMPUTE_ERROR_ON(output == nullptr);
+ _axis = axis;
+ _num_inputs = inputs_vector.size();
- switch(get_data_layout_dimension_index(output->info()->data_layout(), axis))
+ std::vector<ITensorInfo *> inputs_vector_info(inputs_vector.size());
+ std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](ICLTensor * t)
{
- case 0:
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(inputs_vector_info, output->info(), axis));
+
+ unsigned int offset = 0;
+ switch(_axis)
+ {
+ case Window::DimX:
{
- auto func = support::cpp14::make_unique<CLWidthConcatenateLayer>();
- func->configure(inputs_vector, output);
- _concat_function = std::move(func);
+ switch(_num_inputs)
+ {
+ case 2:
+ {
+ // Configure WidthConcatenate2Tensors kernel
+ auto kernel = support::cpp14::make_unique<CLWidthConcatenate2TensorsKernel>();
+ kernel->configure(inputs_vector.at(0), inputs_vector.at(1), output);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case 4:
+ {
+ // Configure WidthConcatenate4Tensors kernel
+ auto kernel = support::cpp14::make_unique<CLWidthConcatenate4TensorsKernel>();
+ kernel->configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ default:
+ {
+ // Configure generic case WidthConcatenate kernels
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = support::cpp14::make_unique<CLWidthConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ offset += inputs_vector.at(i)->info()->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ }
break;
}
- case 2:
+ case Window::DimY:
{
- auto func = support::cpp14::make_unique<CLDepthConcatenateLayer>();
- func->configure(inputs_vector, output);
- _concat_function = std::move(func);
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = support::cpp14::make_unique<CLHeightConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ offset += inputs_vector.at(i)->info()->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ case Window::DimZ:
+ {
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = support::cpp14::make_unique<CLDepthConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ offset += inputs_vector.at(i)->info()->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
break;
}
default:
- ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+ ARM_COMPUTE_ERROR("Axis not supported");
}
}
-Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+ const unsigned int num_inputs = inputs_vector.size();
- switch(get_data_layout_dimension_index(output->data_layout(), axis))
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
+
+ unsigned int offset = 0;
+ switch(axis)
{
- case 0:
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector, output));
+ case Window::DimX:
+ {
+ switch(num_inputs)
+ {
+ case 2:
+ // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], output));
+ break;
+ case 4:
+ // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], output));
+ break;
+ default:
+ // Validate generic case of WidthConcatenate kernel
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, offset, output));
+ offset += input->dimension(axis);
+ }
+ break;
+ }
break;
- case 2:
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayer::validate(inputs_vector, output));
+ }
+ case Window::DimY:
+ {
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLHeightConcatenateLayerKernel::validate(input, offset, output));
+ offset += input->dimension(axis);
+ }
break;
+ }
+ case Window::DimZ:
+ {
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, offset, output));
+ offset += input->dimension(axis);
+ }
+ break;
+ }
default:
- ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+ ARM_COMPUTE_ERROR("Axis not supported");
}
+
+ if(output->total_size() != 0)
+ {
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
return Status{};
}
void CLConcatenateLayer::run()
{
- ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
- _concat_function->run();
+ for(auto &kernel : _concat_kernels)
+ {
+ CLScheduler::get().enqueue(*kernel, true);
+ }
}
} // namespace arm_compute
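Usage sketch for the reworked interface above: the concatenation axis is now a plain size_t (Window::DimX/DimY/DimZ) and the function enqueues the per-axis kernels itself. Tensor names, shapes and data types below are illustrative; the output shape is auto-initialised by configure():

CLTensor a, b, dst;
a.allocator()->init(TensorInfo(TensorShape(32U, 16U, 8U), 1, DataType::F32));
b.allocator()->init(TensorInfo(TensorShape(32U, 16U, 8U), 1, DataType::F32));

CLConcatenateLayer concat;
concat.configure({ &a, &b }, &dst, Window::DimX); // two inputs on DimX -> WidthConcatenate2Tensors path

a.allocator()->allocate();
b.allocator()->allocate();
dst.allocator()->allocate(); // dst is 64x16x8 after auto-initialisation

// ... fill a and b ..., then:
concat.run();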
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 0131801..f09585e 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,13 +58,13 @@
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(conv == nullptr);
- int16_t conv_col[matrix_size];
- int16_t conv_row[matrix_size];
- _is_separable = separate_matrix(conv, conv_col, conv_row, matrix_size);
+ std::array<int16_t, matrix_size> conv_col{ 0 };
+ std::array<int16_t, matrix_size> conv_row{ 0 };
+ _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
if(_is_separable)
{
- std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
+ std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size);
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
// Manage intermediate buffers
@@ -75,8 +75,8 @@
scale = calculate_matrix_scale(conv, matrix_size);
}
- _kernel_hor.configure(input, &_tmp, conv_row, border_mode == BorderMode::UNDEFINED);
- _kernel_vert.configure(&_tmp, output, conv_col, scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
+ _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
_border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
// Allocate intermediate buffer
@@ -96,12 +96,10 @@
if(_is_separable)
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
-
- _memory_group.release();
}
else
{
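The change above replaces variable-length arrays, which are not part of ISO C++, with fixed-size std::array; matrix_size is a compile-time template parameter of CLConvolutionSquare, so the array sizes are known statically and .data() yields the raw pointer the existing helpers expect. A short sketch of the idiom (example() is illustrative):

template <unsigned int matrix_size>
void example(const int16_t *conv)
{
    std::array<int16_t, matrix_size> conv_col{ 0 }; // zero-initialised, no VLA needed
    std::array<int16_t, matrix_size> conv_row{ 0 };
    separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size); // raw pointers where the C-style API expects them
}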
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 0014e71..165d523 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -75,6 +75,13 @@
_function = std::move(f);
break;
}
+ case ConvolutionMethod::FFT:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLFFTConvolutionLayer>(_memory_manager);
+ f->configure(input, weights, biases, output, conv_info, act_info);
+ _function = std::move(f);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
@@ -111,6 +118,12 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups));
break;
}
+ case ConvolutionMethod::FFT:
+ {
+ // Validate FFT-based convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
@@ -169,12 +182,20 @@
return (*found).second;
}
- if(dilation != Size2D(1U, 1U) || (input->dimension(idx_c) < 16))
+ if(dilation != Size2D(1U, 1U))
{
return ConvolutionMethod::GEMM;
}
else
{
+ if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+ {
+ return ConvolutionMethod::FFT;
+ }
+ if(input->dimension(idx_c) < 16)
+ {
+ return ConvolutionMethod::GEMM;
+ }
return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
}
}
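The selection logic added above, restated as a standalone sketch with scalar inputs (the real get_convolution_method() queries ITensorInfo and the validate() routines; fft_ok and winograd_ok stand in for those checks here):

ConvolutionMethod pick_method(bool dilated, unsigned int kernel_h, unsigned int in_c, unsigned int out_c,
                              bool fft_ok, bool winograd_ok)
{
    if(dilated)                                return ConvolutionMethod::GEMM;  // dilation always falls back to GEMM
    if(kernel_h > 7 && in_c > out_c && fft_ok) return ConvolutionMethod::FFT;   // large kernels that reduce channels
    if(in_c < 16)                              return ConvolutionMethod::GEMM;  // shallow inputs
    return winograd_ok ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
}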
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
new file mode 100644
index 0000000..b22809e
--- /dev/null
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLHelpers.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLCropResize.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index)
+{
+ batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind))));
+
+ // crop_box_ind is used to index crop_boxes and retrieve the appropriate crop box.
+ // The crop box is specified by normalized coordinates [y0, x0, y1, x1].
+ const float x0 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(1, crop_box_ind)));
+ const float y0 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(0, crop_box_ind)));
+ const float x1 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(3, crop_box_ind)));
+ const float y1 = *reinterpret_cast<const float *>(crop_boxes->ptr_to_element(Coordinates(2, crop_box_ind)));
+ // The normalized coordinates are scaled to retrieve the floating-point image coordinates, which are then rounded to integers.
+ start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+ end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+ const TensorShape out_shape(input->info()->tensor_shape()[0], abs(end[0] - start[0]) + 1, abs(end[1] - start[1]) + 1);
+ output->info()->set_tensor_shape(out_shape);
+}
+
+inline void run_crop(const ICLTensor *input, ICLTensor *output, uint32_t batch_index, Coordinates start, Coordinates end, float extrapolation_value)
+{
+ bool is_width_flipped = end[0] < start[0];
+ bool is_height_flipped = end[1] < start[1];
+ /** The number of rows out of bounds at the start and end of output. */
+ std::array<int32_t, 2> rows_out_of_bounds{ 0 };
+ /** The number of columns out of bounds at the start and end of output. */
+ std::array<int32_t, 2> cols_out_of_bounds{ 0 };
+ if(is_height_flipped)
+ {
+ rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(start[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
+ rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
+ }
+ else
+ {
+ rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
+ rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(end[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
+ }
+ if(is_width_flipped)
+ {
+ cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(start[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
+ cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
+ }
+ else
+ {
+ cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
+ cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(end[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
+ }
+
+ Window full_window = calculate_max_window(*output->info());
+
+ // Full output window:
+ // --------------------------------
+ // | Out of bounds |
+ // | rows before |
+ // |------------------------------|
+ // | Out of | In | Out of |
+ // | bounds | bounds | bounds |
+ // | cols | elements | cols |
+ // | before | copied | after |
+ // | | from input | |
+ // |------------------------------|
+ // | Out of bounds |
+ // | rows after |
+ // |------------------------------|
+ // Use a separate output window for each section of the full output window.
+ // Fill all output rows that have no elements within the input bounds
+ // with the extrapolation value, using memset.
+ // First for the rows before the in bounds rows.
+ if(rows_out_of_bounds[0] > 0)
+ {
+ Window slice_fill_rows_before(full_window);
+ slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(output, extrapolation_value, &slice_fill_rows_before);
+ CLScheduler::get().enqueue(*kernel);
+ }
+
+ Window slice_in(full_window);
+ slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], output->info()->dimension(2) - rows_out_of_bounds[1], 1));
+ slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], output->info()->dimension(1) - cols_out_of_bounds[1], 1));
+
+ int rows_in_bounds = static_cast<int32_t>(output->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
+ if(rows_in_bounds > 0)
+ {
+ // Fill all elements that share a row with an in bounds element with the extrapolation value.
+ if(cols_out_of_bounds[0] > 0)
+ {
+ Window slice_fill_cols_before(slice_in);
+ slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(output, extrapolation_value, &slice_fill_cols_before);
+ CLScheduler::get().enqueue(*kernel);
+ }
+
+ if(cols_out_of_bounds[1] > 0)
+ {
+ Window slice_fill_cols_after(slice_in);
+ slice_fill_cols_after.set(1, Window::Dimension(output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1), 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(output, extrapolation_value, &slice_fill_cols_after);
+ CLScheduler::get().enqueue(*kernel);
+ }
+
+ // Copy all elements within the input bounds from the input tensor.
+ int cols_in_bounds = static_cast<int32_t>(output->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
+ if(cols_in_bounds > 0)
+ {
+ Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
+ is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
+ Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
+ is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
+ auto kernel = arm_compute::support::cpp14::make_unique<CLCropKernel>();
+
+ kernel->configure(input, output, start_in, end_in, batch_index, extrapolation_value, &slice_in);
+ CLScheduler::get().enqueue(*kernel);
+ }
+ }
+
+ // Fill all rows after the in bounds elements with the extrapolation value.
+ if(rows_out_of_bounds[1] > 0)
+ {
+ Window slice_fill_rows_after(full_window);
+ slice_fill_rows_after.set(2, Window::Dimension(output->info()->dimension(2) - rows_out_of_bounds[1], output->info()->dimension(2), 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(output, extrapolation_value, &slice_fill_rows_after);
+ CLScheduler::get().enqueue(*kernel);
+ }
+}
+} // namespace
+
+CLCropResize::CLCropResize()
+ : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results()
+{
+}
+
+Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
+ Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
+ ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
+ TensorInfo temp_info;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCropKernel::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value));
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ TensorShape out_shape(input->tensor_shape()[0], crop_size.x, crop_size.y, boxes->tensor_shape()[1]);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), out_shape);
+ }
+ return Status{};
+}
+
+void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
+ InterpolationPolicy method, float extrapolation_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+
+ _num_boxes = boxes->info()->tensor_shape()[1];
+ TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
+
+ _input = input;
+ _boxes = boxes;
+ _box_ind = box_ind;
+ _output = output;
+ _method = method;
+ _extrapolation_value = extrapolation_value;
+
+ // For each crop box:
+ //   - The initial cropped image is produced as specified by boxes[i] from the 3D image input[box_ind[i]],
+ //     possibly using a CLCropKernel and up to four CLMemsetKernels.
+ // - A tensor is required to hold this initial cropped image.
+ // - A scale function is used to resize the cropped image to the size specified by crop_size.
+ // - A tensor is required to hold the final scaled image before it is copied into the 4D output
+ // that will hold all final cropped and scaled 3D images using CLCopyKernel.
+ for(unsigned int i = 0; i < _num_boxes; ++i)
+ {
+ auto crop_tensor = support::cpp14::make_unique<CLTensor>();
+ TensorInfo crop_result_info(1, DataType::F32);
+ crop_result_info.set_data_layout(DataLayout::NHWC);
+ crop_tensor->allocator()->init(crop_result_info);
+ _crop_results.emplace_back(std::move(crop_tensor));
+
+ auto scale_tensor = support::cpp14::make_unique<CLTensor>();
+ TensorInfo scaled_result_info(out_shape, 1, DataType::F32);
+ scaled_result_info.set_data_layout(DataLayout::NHWC);
+ scale_tensor->allocator()->init(scaled_result_info);
+ _scaled_results.emplace_back(std::move(scale_tensor));
+ }
+}
+
+void CLCropResize::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
+ // The contents of _boxes and _box_ind are required to calculate the shape
+ // of the initial cropped image and thus are required to configure the
+ // kernels used for cropping and scaling.
+ _boxes->map(CLScheduler::get().queue());
+ _box_ind->map(CLScheduler::get().queue());
+ for(unsigned int i = 0; i < _num_boxes; ++i)
+ {
+ // The size of the crop box in _boxes, and thus the shape of _crop_results[i],
+ // may not be known until run time, so the kernels cannot be configured until then.
+ uint32_t batch_index;
+ Coordinates start{};
+ Coordinates end{};
+ configure_crop(_input, _boxes, _box_ind, _crop_results[i].get(), i, start, end, batch_index);
+
+ auto scale_kernel = support::cpp14::make_unique<CLScale>();
+ scale_kernel->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT);
+ _scale.emplace_back(std::move(scale_kernel));
+
+ Window win = calculate_max_window(*_output->info());
+ win.set(3, Window::Dimension(i, i + 1, 1));
+
+ auto copy_kernel = support::cpp14::make_unique<CLCopyKernel>();
+ copy_kernel->configure(_scaled_results[i].get(), _output, PaddingList(), &win);
+ _copy.emplace_back(std::move(copy_kernel));
+
+ _crop_results[i]->allocator()->allocate();
+ _scaled_results[i]->allocator()->allocate();
+
+ run_crop(_input, _crop_results[i].get(), batch_index, start, end, _extrapolation_value);
+ }
+ _boxes->unmap(CLScheduler::get().queue());
+ _box_ind->unmap(CLScheduler::get().queue());
+ CLScheduler::get().sync();
+ for(auto &kernel : _scale)
+ {
+ kernel->run();
+ }
+ CLScheduler::get().sync();
+ for(auto &kernel : _copy)
+ {
+ CLScheduler::get().enqueue(*kernel, true);
+ }
+ CLScheduler::get().sync();
+}
+} // namespace arm_compute
\ No newline at end of file
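Usage sketch for the new CLCropResize function above. boxes is a (4, num_boxes) tensor whose columns hold the normalized [y0, x0, y1, x1] box coordinates, box_ind holds one batch index per box (read as 32-bit integers by configure_crop()), and the 4D output collects the cropped-and-resized images. Shapes, data types and values below are illustrative:

CLTensor input;   // source images, shape (C, W, H, N) = (3, 224, 224, 8)
CLTensor boxes;   // shape (4, num_boxes) = (4, 2), F32, normalized [y0, x0, y1, x1] per box
CLTensor box_ind; // shape (num_boxes) = (2), batch index of the image each box crops
CLTensor output;  // shape (C, crop_size.x, crop_size.y, num_boxes), F32

input.allocator()->init(TensorInfo(TensorShape(3U, 224U, 224U, 8U), 1, DataType::F32));
boxes.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32));
box_ind.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));
output.allocator()->init(TensorInfo(TensorShape(3U, 64U, 64U, 2U), 1, DataType::F32));

CLCropResize crop_resize;
crop_resize.configure(&input, &boxes, &box_ind, &output, Coordinates2D{ 64, 64 },
                      InterpolationPolicy::BILINEAR, 0.f); // AREA is rejected by validate()

// allocate the tensors, fill input/boxes/box_ind, then:
crop_resize.run(); // maps boxes/box_ind, builds the per-box crop/scale/copy pipeline and runs it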
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 9da02c1..c6f79d3 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -23,188 +23,117 @@
*/
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
+#include <cmath>
#include <memory>
#include <tuple>
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
-CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
- : _memory_group(std::move(memory_manager)),
- _scale_f(),
- _conv_f(),
- _flip_weights(),
- _scaled_output(),
- _original_weights(nullptr),
- _weights_flipped(),
- _is_prepared(false)
+CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_manager(std::move(memory_manager)), _function()
{
}
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
+ unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
+
+ switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
+ {
+ case DeconvolutionMethod::DIRECT:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLDirectDeconvolutionLayer>();
+ f->configure(input, weights, bias, output, deconv_info, weights_info);
+ _function = std::move(f);
+ break;
+ }
+ case DeconvolutionMethod::GEMM:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+ f->configure(input, weights, bias, output, deconv_info);
+ _function = std::move(f);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+ ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
+
+ switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
+ {
+ case DeconvolutionMethod::DIRECT:
+ {
+ // Validate direct deconvolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
+ break;
+ }
+ case DeconvolutionMethod::GEMM:
+ {
+ // Validate gemm-based deconvolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+
+ return Status{};
+}
+
+DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_UNUSED(output, bias, weights_info);
const DataLayout data_layout = input->data_layout();
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
- ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
-
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
-
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
- info.pad().first, info.pad().second, stride_x, stride_y);
-
- const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-
- if(bias != nullptr)
+ if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
{
- if(is_data_type_quantized_asymmetric(input->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ return DeconvolutionMethod::DIRECT;
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
-
- unsigned int padx = 0;
- unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
-
- return Status{};
+ return DeconvolutionMethod::GEMM;
}
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
-
- const DataLayout data_layout = input->info()->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- _original_weights = weights;
- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
- _flip_weights.configure(weights, &_weights_flipped);
-
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
- info.pad().first, info.pad().second, stride_x, stride_y);
-
- const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
-
- _is_prepared = weights_info.retain_internal_weights();
-
- _memory_group.manage(&_scaled_output);
-
- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
- unsigned int padx = 0;
- unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
-
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
- scale_out_info.set_data_layout(data_layout);
- _scaled_output.allocator()->init(scale_out_info);
-
- // configure scale function
- const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
- _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), upsample_info);
-
- // setup the function to convolve the upscaled output
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
- _scaled_output.allocator()->allocate();
-}
-
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
const WeightsInfo &weights_info)
{
- configure(input, weights, bias, output, info, 0, 0, weights_info);
+ configure(input, weights, bias, output, deconv_info, 0, 0, weights_info);
}
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
const WeightsInfo &weights_info)
{
- return CLDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0, weights_info);
+ return CLDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, 0, 0, weights_info);
}
void CLDeconvolutionLayer::run()
{
prepare();
-
- _memory_group.acquire();
-
- _scale_f.run();
- _conv_f.run();
-
- _memory_group.release();
+ _function->run();
}
void CLDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- // Run weights flipping and mark original weights tensor as unused
- _weights_flipped.allocator()->allocate();
- _weights_flipped.map(true);
- _original_weights->map(CLScheduler::get().queue(), true);
- CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
- _weights_flipped.unmap();
- _original_weights->unmap(CLScheduler::get().queue());
- _original_weights->mark_as_unused();
-
- // Prepare convolution
- _conv_f.prepare();
-
- if(!_weights_flipped.is_used())
- {
- _weights_flipped.allocator()->free();
- }
-
- _is_prepared = true;
- }
+ _function->prepare();
}
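The dispatch introduced above reduces to a single rule, shown here as a standalone sketch: the GEMM-based path is picked only when the kernel size matches the stride in both dimensions; every other configuration uses the direct (upsample then convolve) path:

DeconvolutionMethod pick_deconvolution_method(unsigned int kernel_w, unsigned int kernel_h,
                                              unsigned int stride_x, unsigned int stride_y)
{
    if(kernel_w != stride_x || kernel_h != stride_y)
    {
        return DeconvolutionMethod::DIRECT; // CLDirectDeconvolutionLayer
    }
    return DeconvolutionMethod::GEMM;       // CLGEMMDeconvolutionLayer
}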
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index ce8667d..c66dff0 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,14 +27,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
: _upsample(),
+ _memset(),
_output(nullptr)
{
}
@@ -51,22 +48,13 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_output = output;
+ _memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
_upsample.configure(input, _output, inner_border, info);
}
void CLDeconvolutionLayerUpsample::run()
{
- _output->map(CLScheduler::get().queue(), true);
- if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
- {
- const uint8_t quantized_zero = _output->info()->quantization_info().offset;
- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
- }
- else
- {
- memset(_output->buffer(), 0, _output->info()->total_size());
- }
- _output->unmap(CLScheduler::get().queue());
-
- CLScheduler::get().enqueue(_upsample, false);
+ CLScheduler::get().enqueue(_memset, false);
+ CLScheduler::get().enqueue(_upsample, true);
}
+} // namespace arm_compute
\ No newline at end of file
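The rewrite above moves the zero-fill of the upsampled output from a host-side map/memset onto the device via CLMemsetKernel. Because the fill value is built as PixelValue(0, data_type, quantization_info), a quantized output is filled with its quantization offset, matching what the removed std::fill_n branch did by hand. A small illustration (values are arbitrary, and the exact PixelValue storage described here is an assumption based on the removed code path):

// QASYMM8 output with scale 0.1 and offset 5:
const QuantizationInfo qinfo(0.1f, 5);
const PixelValue       zero(0, DataType::QASYMM8, qinfo); // represents real 0.0f, i.e. the stored byte is the offset (5)
// _memset writes this value across the whole output before _upsample scatters the input samples into it.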
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index e46647a..f687e54 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -36,8 +36,7 @@
using namespace arm_compute;
CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT
- : _inputs_vector(),
- _concat_kernels_vector(),
+ : _concat_kernels_vector(),
_border_handlers_vector(),
_num_inputs(0)
{
@@ -53,10 +52,10 @@
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLDepthConcatenateLayerKernel[]>(_num_inputs);
- _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+ _concat_kernels_vector.resize(_num_inputs);
+ _border_handlers_vector.resize(_num_inputs);
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -82,7 +81,7 @@
// Output auto initialization if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
unsigned int depth_offset = 0;
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 15cbfce..97b0a01 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -45,10 +45,18 @@
}
void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- ActivationLayerInfo act_info)
+ ActivationLayerInfo act_info, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ // idx_w and idx_h only used for validation
+ const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_UNUSED(idx_w);
+ ARM_COMPUTE_UNUSED(idx_h);
+
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
@@ -62,11 +70,13 @@
const ICLTensor *weights_to_use = weights;
ICLTensor *output_to_use = output;
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+
DepthwiseConvolutionReshapeInfo info;
info.c0 = 4;
- info.transpose = is_stride_1 && is_dot8_supported;
+ info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
if(_needs_permute)
{
@@ -103,7 +113,7 @@
// Configure kernel
_kernel->set_target(CLScheduler::get().target());
- _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
+ _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info, dilation);
// Permute output if needed
if(_needs_permute)
@@ -126,26 +136,26 @@
}
Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- ActivationLayerInfo act_info, GPUTarget gpu_target)
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
- const bool needs_permute = is_nhwc && (depth_multiplier > 1);
- const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ const bool needs_permute = is_nhwc && (depth_multiplier > 1);
+ const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
DepthwiseConvolutionReshapeInfo info;
info.c0 = 4;
- info.transpose = is_stride_1 && is_dot8_supported;
+ info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
if(needs_permute)
{
TensorShape permuted_input_shape = input->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
@@ -155,7 +165,8 @@
const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target,
+ dilation));
}
else if(is_nhwc)
{
@@ -163,13 +174,13 @@
{
auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
- act_info));
+ act_info, dilation));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation));
}
return Status{};
@@ -179,7 +190,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_needs_permute)
{
@@ -192,8 +203,6 @@
{
_permute_output_to_nhwc.run();
}
-
- _memory_group.release();
}
void CLDepthwiseConvolutionLayer3x3::prepare()
@@ -229,7 +238,7 @@
}
void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -238,12 +247,15 @@
const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
if(bool(can_run_optimised_3x3_kernel))
{
auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
- f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
_optimised_function = std::move(f);
}
else
@@ -262,7 +274,7 @@
const GPUTarget gpu_target = CLScheduler::get().target();
// Calculate output shape
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -283,7 +295,7 @@
shape_im2col.set(2, weights_z);
_input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
_im2col_kernel.set_target(gpu_target);
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
CLScheduler::get().tune_kernel_static(_im2col_kernel);
// Weights reshape configuration
@@ -310,7 +322,8 @@
const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
+ int output_multiplier;
+ int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
_output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
_output_reshaped.allocator()->allocate();
@@ -345,11 +358,14 @@
}
Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
if(can_run_optimised_3x3_kernel)
@@ -361,7 +377,7 @@
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const bool append_bias = (biases != nullptr) && !is_quantized;
- const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
const size_t weights_w = weights->dimension(idx_w);
const size_t weights_h = weights->dimension(idx_h);
const size_t weights_z = weights->dimension(idx_c);
@@ -375,7 +391,7 @@
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
const TensorShape shape_weights_reshape(patch_size, weights_z);
TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
@@ -405,7 +421,7 @@
}
else
{
- CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation);
}
return Status{};
}
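The new dilation checks above compare the dilated kernel extent, k + (k - 1) * (d - 1), against the padded input extent, and the dilation is now forwarded down to the kernels. A quick worked example plus an illustrative configure() call (src, weights, biases, dst and conv_info are assumed to be set up as usual):

// Dilated extent per dimension: k + (k - 1) * (d - 1)
//   e.g. k = 3, d = 2 -> 3 + 2 * 1 = 5, so the padded input must span at least 5 elements
//   in that dimension, which is exactly what the new ERROR_ON checks assert.
CLDepthwiseConvolutionLayer dwc;
dwc.configure(&src, &weights, &biases, &dst, conv_info,
              1 /* depth_multiplier */, ActivationLayerInfo(), Size2D(2, 2) /* dilation */);
dwc.run();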
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 6f33b2e..cdfdfc7 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,36 +21,22 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
-CLDequantizationLayer::CLDequantizationLayer()
- : _dequantize_kernel()
+namespace arm_compute
{
+void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLDequantizationLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
}
-Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(input, output, min_max));
-
- return Status{};
+ return CLDequantizationLayerKernel::validate(input, output);
}
-
-void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-
- _dequantize_kernel.configure(input, output, min_max);
-}
-
-void CLDequantizationLayer::run()
-{
- // Run dequantization kernel
- CLScheduler::get().enqueue(_dequantize_kernel, false);
-}
+} // namespace arm_compute
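With the rewrite above, CLDequantizationLayer is a thin wrapper that owns a single CLDequantizationLayerKernel and no longer needs a min/max tensor: the input's own quantization info drives the conversion. Minimal usage sketch (shapes and quantization parameters are illustrative):

CLTensor q_in, f32_out;
q_in.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 128)));
f32_out.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

CLDequantizationLayer dequant;
ARM_COMPUTE_ERROR_THROW_ON(CLDequantizationLayer::validate(q_in.info(), f32_out.info()));
dequant.configure(&q_in, &f32_out);

q_in.allocator()->allocate();
f32_out.allocator()->allocate();
// ... fill q_in ..., then:
dequant.run();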
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
new file mode 100644
index 0000000..6e14e26
--- /dev/null
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+
+CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _scale_f(),
+ _conv_f(),
+ _flip_weights(),
+ _scaled_output(),
+ _original_weights(nullptr),
+ _weights_flipped(),
+ _flip_axis(),
+ _is_prepared(false)
+{
+}
+
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
+ info.pad().first, info.pad().second, stride_x, stride_y);
+
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+ if(bias != nullptr)
+ {
+ if(is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
+
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, 0, 0, out_dims, padx, pady);
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(), info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+
+ return Status{};
+}
+
+void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ _original_weights = weights;
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
+
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
+ info.pad().first, info.pad().second, stride_x, stride_y);
+
+ const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+
+ _is_prepared = weights_info.retain_internal_weights();
+
+ _memory_group.manage(&_scaled_output);
+
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
+ unsigned int padx = 0;
+ unsigned int pady = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, 0, 0, out_dims, padx, pady);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+    // Configure scale function
+ const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+ _scale_f.configure(input, &_scaled_output, BorderSize(), upsample_info);
+
+ // Setup the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
+ _scaled_output.allocator()->allocate();
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+ _flip_axis.map(true);
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ if(weights->info()->data_layout() == DataLayout::NHWC)
+ {
+ axis_data[0] = 1;
+ axis_data[1] = 2;
+ }
+ else
+ {
+ axis_data[0] = 0;
+ axis_data[1] = 1;
+ }
+ _flip_axis.unmap();
+}
+
+void CLDirectDeconvolutionLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _scale_f.run();
+ _conv_f.run();
+}
+
+void CLDirectDeconvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ _flip_weights.run();
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
+ // Free flipped weights
+ if(!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
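
The new CLDirectDeconvolutionLayer above composes an upsample, a weight flip and a stride-1 convolution. A calling sketch under stated assumptions: tensor creation and the CL runtime setup are left to the caller, and the stride/padding values are only examples, not taken from the patch.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"

using namespace arm_compute;

// src/weights/bias are assumed to be initialised F32 CLTensors; dst may be left
// uninitialised, since configure() auto-initialises it from the deconvolution shape.
void deconvolution_sketch(CLTensor &src, CLTensor &weights, CLTensor &bias, CLTensor &dst)
{
    const PadStrideInfo deconv_info(2, 2, 1, 1);   // stride 2, symmetric padding 1 (illustrative)

    CLDirectDeconvolutionLayer deconv;             // default-constructed; memory manager assumed optional
    deconv.configure(&src, &weights, &bias, &dst, deconv_info, WeightsInfo());

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    deconv.run();                                  // first run() also executes prepare() (weight flipping)
}
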
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
new file mode 100644
index 0000000..49b5a2a
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _run_scale(false)
+{
+}
+
+void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
+
+ // Decompose size to radix factors
+ const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
+ const unsigned int N = input->info()->tensor_shape()[config.axis];
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+ ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+ // Flags
+ _run_scale = config.direction == FFTDirection::Inverse;
+ const bool is_c2r = input->info()->num_channels() == 2 && output->info()->num_channels() == 1;
+
+ // Configure digit reverse
+ FFTDigitReverseKernelInfo digit_reverse_config;
+ digit_reverse_config.axis = config.axis;
+ digit_reverse_config.conjugate = config.direction == FFTDirection::Inverse;
+ TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+ _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+ _memory_group.manage(&_digit_reversed_input);
+ _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+
+ // Create and configure FFT kernels
+ unsigned int Nx = 1;
+ _num_ffts = decomposed_vector.size();
+ _fft_kernels.resize(_num_ffts);
+ for(unsigned int i = 0; i < _num_ffts; ++i)
+ {
+ const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+ FFTRadixStageKernelInfo fft_kernel_info;
+ fft_kernel_info.axis = config.axis;
+ fft_kernel_info.radix = radix_for_stage;
+ fft_kernel_info.Nx = Nx;
+ fft_kernel_info.is_first_stage = (i == 0);
+ _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+
+ Nx *= radix_for_stage;
+ }
+
+ // Configure scale kernel
+ if(_run_scale)
+ {
+ FFTScaleKernelInfo scale_config;
+ scale_config.scale = static_cast<float>(N);
+ scale_config.conjugate = config.direction == FFTDirection::Inverse;
+ is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+ }
+
+ // Allocate tensors
+ _digit_reversed_input.allocator()->allocate();
+ _digit_reverse_indices.allocator()->allocate();
+
+ // Init digit reverse indices
+ const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+ _digit_reverse_indices.map(CLScheduler::get().queue(), true);
+ std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
+ _digit_reverse_indices.unmap(CLScheduler::get().queue());
+}
+
+Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+
+ // Check if FFT is decomposable
+ const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
+ const unsigned int N = input->tensor_shape()[config.axis];
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+ ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+void CLFFT1D::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Run digit reverse
+ CLScheduler::get().enqueue(_digit_reverse_kernel, false);
+
+ // Run radix kernels
+ for(unsigned int i = 0; i < _num_ffts; ++i)
+ {
+ CLScheduler::get().enqueue(_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
+ }
+
+ // Run output scaling
+ if(_run_scale)
+ {
+ CLScheduler::get().enqueue(_scale_kernel, true);
+ }
+}
+} // namespace arm_compute
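
CLFFT1D decomposes the transform length into the radix stages supported by CLFFTRadixStageKernel, so the length along the chosen axis must factor into those radices. A minimal sketch of the calling side, assuming an initialised CL runtime and caller-created complex (2-channel) F32 tensors; FFTDirection::Forward is assumed as the counterpart of the Inverse value used above.

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFT1D.h"

using namespace arm_compute;

// src and dst are assumed to be 2-channel (complex) F32 CLTensors with identical shapes.
void fft1d_sketch(CLTensor &src, CLTensor &dst)
{
    FFT1DInfo fft_info;
    fft_info.axis      = 0;                        // validate() only accepts axis 0 or 1
    fft_info.direction = FFTDirection::Forward;    // Inverse would additionally run the scale kernel

    CLFFT1D fft;                                   // default memory manager assumed
    fft.configure(&src, &dst, fft_info);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    fft.run();                                     // digit-reverse, radix stages, optional scaling
}
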
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
new file mode 100644
index 0000000..165e784
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFT2D.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+{
+}
+
+void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
+
+ // Setup first pass
+ FFT1DInfo first_pass_config;
+ first_pass_config.axis = config.axes.first;
+ first_pass_config.direction = config.direction;
+ _memory_group.manage(&_first_pass_tensor);
+ _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);
+
+ // Setup second pass
+ FFT1DInfo second_pass_config;
+ second_pass_config.axis = config.axes.second;
+ second_pass_config.direction = config.direction;
+ _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config);
+ _first_pass_tensor.allocator()->allocate();
+}
+
+Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ // Create intermediate tensor info
+ TensorInfo first_pass_tensor(input->clone()->set_is_resizable(true).reset_padding().set_num_channels(2));
+
+ // Validate first pass
+ FFT1DInfo first_pass_config;
+ first_pass_config.axis = config.axes.first;
+ first_pass_config.direction = config.direction;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(input, &first_pass_tensor, first_pass_config));
+
+ // Validate second pass
+ FFT1DInfo second_pass_config;
+ second_pass_config.axis = config.axes.second;
+ second_pass_config.direction = config.direction;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+void CLFFT2D::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _first_pass_func.run();
+ _second_pass_func.run();
+}
+} // namespace arm_compute
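
CLFFT2D simply chains two CLFFT1D passes over the configured axes through an intermediate tensor owned by the memory group. A usage sketch; the FFT2DInfo member names follow the accesses in the code above, everything else (tensor setup, direction default) is assumed.

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFT2D.h"

using namespace arm_compute;

// src and dst are assumed to be 2-channel (complex) F32 CLTensors with identical shapes.
void fft2d_sketch(CLTensor &src, CLTensor &dst)
{
    FFT2DInfo fft2d_info;
    fft2d_info.axes.first  = 0;                    // axis of the first 1D pass
    fft2d_info.axes.second = 1;                    // axis of the second 1D pass
    fft2d_info.direction   = FFTDirection::Forward;

    CLFFT2D fft2d;
    fft2d.configure(&src, &dst, fft2d_info);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    fft2d.run();                                   // runs the two chained CLFFT1D functions
}
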
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
new file mode 100644
index 0000000..afb1cab
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
+int pad_decomposable(int N)
+{
+ const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
+
+ int pad = 0;
+ bool is_decomposed = false;
+ while(!is_decomposed)
+ {
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
+ is_decomposed = !decomposed_vector.empty();
+ if(!is_decomposed)
+ {
+ ++pad;
+ }
+ }
+ return pad;
+}
+} // namespace
+CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager),
+ _flip_weights_func(),
+ _permute_input_func(),
+ _permute_output_func(),
+ _permute_weights_func(),
+ _permute_bias_func(),
+ _pad_input_func(),
+ _pad_weights_func(),
+ _transform_input_func(memory_manager),
+ _transform_weights_func(),
+ _itransform_output_func(memory_manager),
+ _prod_func(),
+ _reduce_func(),
+ _extract_output_func(),
+ _bias_add_func(),
+ _activation_layer_func(),
+ _permuted_input(),
+ _permuted_weights(),
+ _permuted_bias(),
+ _permuted_output(),
+ _padded_input(),
+ _padded_weights(),
+ _flip_axis(),
+ _flipped_weights(),
+ _transformed_input(),
+ _transformed_weights(),
+ _input_weights_product(),
+ _output_product(),
+ _output_reduced(),
+ _itransformed_output(),
+ _reshaped_output(),
+ _bias_output(),
+ _original_weights(nullptr),
+ _original_bias(nullptr),
+ _is_activationlayer_enabled(false),
+ _needs_permute(false),
+ _has_bias(false),
+ _is_prepared(false)
+{
+}
+
+void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ _original_weights = weights;
+ _original_bias = biases;
+
+    // Flag if bias addition is required
+ _has_bias = biases != nullptr;
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+ pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+ // Tensors to use
+ ICLTensor *input_to_use = input;
+ const ICLTensor *weights_to_use = weights;
+ ICLTensor *output_to_use = _has_bias ? &_bias_output : output;
+
+ // Permute bias
+ if(biases != nullptr)
+ {
+ _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
+ _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
+ }
+
+ // Permute input if needed
+ _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
+ if(_needs_permute)
+ {
+ _memory_group.manage(&_permuted_input);
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+ input_to_use = &_permuted_input;
+ weights_to_use = &_permuted_weights;
+ }
+
+ // Flip weights
+ _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
+
+ // Pad weights
+ const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+ _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
+
+ // Transform weights
+ _transform_weights_func = support::cpp14::make_unique<CLFFT2D>();
+ _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
+
+ // Pad input
+ const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+ _memory_group.manage(&_padded_input);
+ _pad_input_func.configure(input_to_use, &_padded_input, padding_in);
+ if(_needs_permute)
+ {
+ _permuted_input.allocator()->allocate();
+ }
+
+ // Transform input
+ _memory_group.manage(&_transformed_input);
+ _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo());
+ _padded_input.allocator()->allocate();
+
+ // Perform product
+ _memory_group.manage(&_output_product);
+ _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product);
+ _transformed_input.allocator()->allocate();
+
+ // Perform reduction
+ _memory_group.manage(&_output_reduced);
+ _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM);
+ _output_product.allocator()->allocate();
+
+ // Transform output
+ _memory_group.manage(&_itransformed_output);
+    FFT2DInfo itransform_info;
+    itransform_info.direction = FFTDirection::Inverse;
+    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itransform_info);
+ _output_reduced.allocator()->allocate();
+
+ // Reshape output
+ TensorShape reshaped_shape = _itransformed_output.info()->tensor_shape();
+ reshaped_shape.remove_dimension(2);
+ _reshaped_output.allocator()->init(_itransformed_output.info()->clone()->set_tensor_shape(reshaped_shape));
+
+ // Extract correct region
+ const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
+ const int start_top = kernel_size.y() - conv_info.pad_top() - 1;
+ const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_bottom = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+ if(_has_bias)
+ {
+ _memory_group.manage(&_bias_output);
+ }
+ else if(_needs_permute)
+ {
+ output_to_use = &_permuted_output;
+ _memory_group.manage(&_permuted_output);
+ }
+    _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_bottom));
+ _itransformed_output.allocator()->allocate();
+
+ // Add bias
+ if(biases != nullptr)
+ {
+ output_to_use = output;
+ if(_needs_permute)
+ {
+ output_to_use = &_permuted_output;
+ _memory_group.manage(&_permuted_output);
+ }
+ auto_init_if_empty(*output_to_use->info(), *_bias_output.info());
+ _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP);
+ _bias_output.allocator()->allocate();
+ }
+
+ // Permute output
+ if(_needs_permute)
+ {
+        // Configure the function to transform the convolved output back to the original NHWC layout
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ // Allocate tensors
+ _permuted_output.allocator()->allocate();
+ }
+
+ // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activation_layer_func.configure(output, nullptr, act_info);
+ }
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+ _flip_axis.map(true);
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ axis_data[0] = 0;
+ axis_data[1] = 1;
+ _flip_axis.unmap();
+}
+
+Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+
+ // Strides
+ const auto strides = conv_info.stride();
+ ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+
+ // Validate biases
+ if(biases != nullptr)
+ {
+ const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channels] != biases->tensor_shape().x());
+ }
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+ }
+
+ return Status{};
+}
+
+void CLFFTConvolutionLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Transform input
+ if(_needs_permute)
+ {
+ _permute_input_func.run();
+ }
+ _pad_input_func.run();
+ _transform_input_func.run();
+
+    // Perform operations in the frequency domain
+ _prod_func.run();
+ _reduce_func.run();
+
+ // Transform output
+ _itransform_output_func.run();
+ _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
+ _extract_output_func.run();
+ // Add bias
+ if(_has_bias)
+ {
+ _bias_add_func.run();
+ }
+ if(_needs_permute)
+ {
+ _permute_output_func.run();
+ }
+
+ // Run activation layer
+ if(_is_activationlayer_enabled)
+ {
+ _activation_layer_func.run();
+ }
+}
+
+void CLFFTConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Permute bias to NCHW
+ if(_original_bias != nullptr)
+ {
+ _permuted_bias.allocator()->allocate();
+ _permute_bias_func.run();
+ _original_bias->mark_as_unused();
+ }
+
+ const ICLTensor *cur_weights = _original_weights;
+ // Permute weights
+ if(_needs_permute)
+ {
+ ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
+
+ _permuted_weights.allocator()->allocate();
+ _permute_weights_func.run();
+ cur_weights->mark_as_unused();
+ cur_weights = &_permuted_weights;
+ }
+
+ // Flip weights
+ _flipped_weights.allocator()->allocate();
+ _flip_weights_func.run();
+ cur_weights->mark_as_unused();
+
+ // Pad weights
+ _padded_weights.allocator()->allocate();
+ _pad_weights_func.run();
+ _flipped_weights.mark_as_unused();
+ CLScheduler::get().queue().finish();
+ _flipped_weights.allocator()->free();
+
+ // Transform weights to frequency domain
+ _transformed_weights.allocator()->allocate();
+ _transform_weights_func->run();
+ _padded_weights.mark_as_unused();
+ CLScheduler::get().queue().finish();
+ // Delete object and release internal memory
+ _transform_weights_func.reset();
+ _padded_weights.allocator()->free();
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
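
The FFT-based convolution above pads the input and the flipped weights up to a decomposable size, multiplies them in the frequency domain, reduces over input channels and then extracts the valid region. Its validate() only accepts F32, unit strides and "same" padding (pad equal to half the kernel size). A calling sketch under those constraints, with tensor creation assumed to happen elsewhere and the 5x5 kernel / RELU activation used purely as an example.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"

using namespace arm_compute;

// src/weights/biases/dst are assumed to be initialised F32 CLTensors.
void fft_convolution_sketch(CLTensor &src, CLTensor &weights, CLTensor &biases, CLTensor &dst)
{
    const PadStrideInfo conv_info(1, 1, 2, 2);     // stride 1, pad 2 to match a 5x5 kernel

    CLFFTConvolutionLayer fft_conv;
    fft_conv.configure(&src, &weights, &biases, &dst, conv_info,
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    fft_conv.run();   // first run() flips, pads and FFT-transforms the weights inside prepare()
}
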
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index d6cda91..fe2a18c 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -97,7 +97,7 @@
{
cl::CommandQueue q = CLScheduler::get().queue();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_non_max)
{
@@ -129,6 +129,4 @@
}
q.flush();
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 6a2aac6..7b9229c 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -231,7 +231,8 @@
if(_is_quantized)
{
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
- int output_multiplier, output_shift;
+ int output_multiplier;
+ int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
_gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
_gemmlowp_output.allocator()->allocate();
@@ -333,7 +334,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
@@ -363,8 +364,6 @@
CLScheduler::get().enqueue(_accumulate_biases_kernel);
}
}
-
- _memory_group.release();
}
void CLFullyConnectedLayer::prepare()
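
For the quantized path touched above, the float multiplier input_scale * weights_scale / output_scale is re-expressed as an integer multiplier plus a right shift before being handed to the GEMMLowp output stage. A small sketch of that decomposition in isolation; the header path is an assumption, while the call itself mirrors the hunk above.

#include "arm_compute/core/utils/quantization/AsymmHelpers.h"  // assumed location of the helper

void requantization_sketch()
{
    // Illustrative scales: 0.5f (input) * 0.25f (weights) / 0.2f (output) = 0.625f
    const float multiplier = 0.5f * 0.25f / 0.2f;

    int output_multiplier = 0;
    int output_shift      = 0;
    arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);

    // output_multiplier and output_shift now encode 'multiplier' as a fixed-point value
    // plus a right shift, in the form consumed by the GEMMLowp output stage.
}
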
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index e91038f..492709f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -23,7 +23,10 @@
*/
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/Helpers.h"
@@ -33,7 +36,6 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
#include "arm_compute/runtime/ITensorAllocator.h"
namespace arm_compute
@@ -41,46 +43,6 @@
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::cl_gemm;
-namespace
-{
-inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
-{
- bool flag = true;
-
- if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
- {
- if((m > 1) && n < 16)
- {
- flag = true;
- }
- else
- {
- // COMPMID-852
- if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
- {
- constexpr float alpha = 3.2f;
- constexpr float fact0 = 1.51f;
- constexpr float fact1 = 1.66f;
- constexpr float ops = 12.0f;
- const float scale = k > 1024 ? 1.07f : 1.0f;
- flag = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
- }
- else
- {
- flag = false;
- }
- }
- }
- else
- {
- // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
- flag = m != 1 && reshape_b_only_on_first_run;
- }
-
- return flag;
-}
-} // namespace
-
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_mm_kernel(),
@@ -88,57 +50,102 @@
_reshape_lhs_kernel(),
_reshape_rhs_kernel(),
_mm_reshaped_kernel(),
+ _mm_reshaped_only_rhs_kernel(),
_tmp_a(),
_tmp_b(),
_original_b(nullptr),
- _is_interleaved_transposed(false),
_run_addition(false),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
- _is_new_gemm_reshaped(false)
+ _gemm_type(GEMMType::NATIVE)
{
}
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+CLGEMM::GEMMType CLGEMM::select_gemm_type(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ GEMMType gemm_type = GEMMType::RESHAPED_V1;
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+ if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
+ {
+ if((m > 1) && (n < 16))
+ {
+ gemm_type = GEMMType::RESHAPED_V1;
+ }
+ else if((m == 1) && (data_type == DataType::F32))
+ {
+ gemm_type = GEMMType::RESHAPED_ONLY_RHS;
+ }
+ else
+ {
+ // COMPMID-852
+ if((k > 256) && (m > 4) && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+ {
+ constexpr float alpha = 3.2f;
+ constexpr float fact0 = 1.51f;
+ constexpr float fact1 = 1.66f;
+ constexpr float ops = 12.0f;
+ const float scale = k > 1024 ? 1.07f : 1.0f;
+ gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+ }
+ else
+ {
+ gemm_type = GEMMType::NATIVE;
+ }
+ }
- // Check if we need to reshape the matrix B only on the first run
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _is_prepared = gemm_info.retain_internal_weights();
- _original_b = b;
+ const auto workload = static_cast<float>((m * n) / 20.0f);
- const ICLTensor *matrix_a = a;
- const ICLTensor *matrix_b = b;
+ gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
+ }
+ else
+ {
+ // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
+ gemm_type = ((m != 1) && reshape_b_only_on_first_run) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+ }
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
+ return gemm_type;
+}
+
+void CLGEMM::configure_native(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _reshape_lhs_kernel.set_target(gpu_target);
_mm_kernel.set_target(gpu_target);
- // Arguments used by GEMMReshapeInfo
- // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- DataType data_type = a->info()->data_type();
+ GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d());
+
+ // Configure and tune matrix multiply kernel
+ _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision());
+
+ // Tune kernel statically
+ CLScheduler::get().tune_kernel_static(_mm_kernel);
+}
+
+void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
const unsigned int n = b->info()->dimension(0);
const unsigned int k = a->info()->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
+ // Set the target for the kernels
+ _reshape_lhs_kernel.set_target(gpu_target);
+ _mm_kernel.set_target(gpu_target);
+
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
mult_transpose1xW_width = 4;
mult_interleave4x4_height = 2;
}
+
GEMMRHSMatrixInfo rhs_info;
rhs_info.n0 = 16 / b->info()->element_size();
rhs_info.k0 = 1;
@@ -153,112 +160,183 @@
lhs_info.interleave = true;
lhs_info.transpose = true;
- // Check if we need to reshape the matrix A and matrix B
- _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+ GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
- // Check if we can run the new reshaped GEMM
- const auto workload = static_cast<float>((m * n) / 20.0f);
- _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
-
- const bool add_matrix_c = (beta != 0.f && c != nullptr);
- const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
- const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
-
- // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
- if(_is_interleaved_transposed)
+ _memory_group.manage(&_tmp_a);
+ if(!_reshape_b_only_on_first_run)
{
- reinterpret_input_as_3d = false;
-
- matrix_a = &_tmp_a;
- matrix_b = &_tmp_b;
-
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
- // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
-
- if(_is_new_gemm_reshaped)
- {
- GEMMLHSMatrixInfo lhs_info;
-
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
-
- _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
- _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
-
- // Configure and tune matrix multiply kernel
- _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
- depth_output_gemm3d, reinterpret_input_as_3d));
- }
- else
- {
- // Configure interleave kernel
- _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
- // Configure transpose kernel
- _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
- }
+ _memory_group.manage(&_tmp_b);
}
- if(!_is_new_gemm_reshaped)
- {
- // Configure and tune matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
- GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
- gemm_info.fp_mixed_precision());
- CLScheduler::get().tune_kernel_static(_mm_kernel);
- }
+ // Configure interleave kernel
+ _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
- if(_is_interleaved_transposed)
- {
- // Allocate intermediate tensors
- _tmp_a.allocator()->allocate();
- if(!_reshape_b_only_on_first_run)
- {
- _tmp_b.allocator()->allocate();
- }
- }
+ // Configure transpose kernel
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
- // Configure matrix addition kernel
- if(add_matrix_c && !use_fused_add)
+ // Configure and tune matrix multiply kernel
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision());
+
+ CLScheduler::get().tune_kernel_static(_mm_kernel);
+
+ // Allocate intermediate tensors
+ _tmp_a.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
{
- _ma_kernel.configure(c, output, beta);
- _run_addition = true;
+ _tmp_b.allocator()->allocate();
}
}
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON(c != nullptr);
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_UNUSED(c);
+
+ DataType data_type = a->info()->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _reshape_lhs_kernel.set_target(gpu_target);
+ _mm_kernel.set_target(gpu_target);
+
+ GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, false);
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
+ // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
+
+ GEMMLHSMatrixInfo lhs_info{};
+ GEMMRHSMatrixInfo rhs_info{};
+
+ // Pick up the GEMM configuration
+ std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
+
+ // Configure lhs_info and rhs_info
+ std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+ _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+
+ // Allocate intermediate tensors
+ _tmp_a.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
+}
+
+void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON(c != nullptr);
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_UNUSED(c);
+
+ DataType data_type = a->info()->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _mm_kernel.set_target(gpu_target);
+
+ GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ // Manage intermediate buffers
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
+
+ GEMMLHSMatrixInfo lhs_info{};
+ GEMMRHSMatrixInfo rhs_info{};
+
+ // Pick up the GEMM configuration
+ std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
+
+ // Configure lhs_info and rhs_info
+ std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
+}
+
+Status CLGEMM::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_UNUSED(output);
- // Check if we need to reshape the matrix B only on the first run
- const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool add_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
+ false, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+
+ if(add_c && !fuse_add)
+ {
+ // Validate matrix addition kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+ }
+
+ return Status{};
+}
+
+Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
// Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Arguments used by GEMMReshapeInfo
- // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- DataType data_type = a->data_type();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
const unsigned int n = b->dimension(0);
const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool add_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
@@ -280,66 +358,21 @@
lhs_info.interleave = true;
lhs_info.transpose = true;
- // Check if we need to reshape the matrix A and matrix B
- const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
- // Check if we can run the new reshaped GEMM
- const auto workload = static_cast<float>((m * n) / 20.0f);
- const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
- const bool add_matrix_c = (beta != 0.f && c != nullptr);
- const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
- const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
- // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
- if(run_interleave_transpose)
- {
- reinterpret_input_as_3d = false;
- }
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
+ true, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
-
- if(run_interleave_transpose)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- if(is_new_gemm_reshaped)
- {
- GEMMLHSMatrixInfo lhs_info;
-
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
-
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
- depth_output_gemm3d, reinterpret_input_as_3d)));
- }
- else
- {
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
- // Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
- }
- }
-
- if(!is_new_gemm_reshaped)
- {
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
- run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
- }
-
- if(add_matrix_c && !use_fused_add)
+ if(add_c && !fuse_add)
{
// Validate matrix addition kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
@@ -348,32 +381,263 @@
return Status{};
}
+Status CLGEMM::validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool add_c = (beta != 0.f && c != nullptr);
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, false);
+
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Pick up the GEMM configuration
+ std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
+
+ // Configure lhs_info and rhs_info
+ std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
+
+ if(add_c)
+ {
+ // Validate matrix addition kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+ }
+
+ return Status{};
+}
+
+Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ TensorInfo tmp_b_info{};
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool add_c = (beta != 0.f && c != nullptr);
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Pick up the GEMM configuration
+ std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
+
+ // Configure lhs_info and rhs_info
+ std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
+
+ if(add_c)
+ {
+ // Validate matrix addition kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+ }
+
+ return Status{};
+}
+
+void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+
+ // Check if we need to reshape the matrix B only on the first run
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _is_prepared = gemm_info.retain_internal_weights();
+ _original_b = b;
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+
+ // Select GEMMType
+ _gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+
+ const bool is_gemm_v2 = (_gemm_type == GEMMType::RESHAPED_V2) || (_gemm_type == GEMMType::RESHAPED_ONLY_RHS);
+ const bool add_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool fuse_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !is_gemm_v2;
+
+ switch(_gemm_type)
+ {
+ case GEMMType::NATIVE:
+ {
+ configure_native(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ break;
+ }
+ case GEMMType::RESHAPED_V1:
+ {
+ configure_reshaped_v1(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ break;
+ }
+ case GEMMType::RESHAPED_V2:
+ {
+ configure_reshaped_v2(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ break;
+ }
+ case GEMMType::RESHAPED_ONLY_RHS:
+ {
+ configure_reshaped_only_rhs(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("GEMMType not supported");
+ }
+ }
+
+ // Configure matrix addition kernel
+ if(add_c && !fuse_add)
+ {
+ _ma_kernel.configure(c, output, beta);
+ _run_addition = true;
+ }
+}
+
+Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+
+ // Select GEMMType
+ GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target);
+
+ switch(gemm_type)
+ {
+ case GEMMType::NATIVE:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c, output, alpha, beta, gemm_info));
+ break;
+ }
+ case GEMMType::RESHAPED_V1:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c, output, alpha, beta, gemm_info));
+ break;
+ }
+ case GEMMType::RESHAPED_V2:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c, output, alpha, beta, gemm_info));
+ break;
+ }
+ case GEMMType::RESHAPED_ONLY_RHS:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c, output, alpha, beta, gemm_info));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
+ }
+ }
+
+ return Status{};
+}
+
void CLGEMM::run()
{
prepare();
- _memory_group.acquire();
-
- if(_is_interleaved_transposed)
- {
- // Run interleave kernel
- CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
-
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
- }
- }
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run matrix multiply kernel
- if(_is_new_gemm_reshaped)
+ switch(_gemm_type)
{
- CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
- }
- else
- {
- CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ case GEMMType::NATIVE:
+ {
+ CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ break;
+ }
+ case GEMMType::RESHAPED_V1:
+ {
+ // Run interleave kernel
+ CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+
+ if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ }
+
+ CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ break;
+ }
+ case GEMMType::RESHAPED_V2:
+ {
+ // Run interleave kernel
+ CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+
+ if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ }
+
+ CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+ break;
+ }
+ case GEMMType::RESHAPED_ONLY_RHS:
+ {
+ if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ }
+
+ CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, !_run_addition);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("GEMMType not supported");
+ }
}
// Run matrix addition kernel
@@ -381,15 +645,13 @@
{
CLScheduler::get().enqueue(_ma_kernel);
}
-
- _memory_group.release();
}
void CLGEMM::prepare()
{
if(!_is_prepared)
{
- if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+ if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run)
{
// Run transpose kernel and mark original weights tensor as unused
_tmp_b.allocator()->allocate();
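Note on the acquire()/release() removal seen throughout this patch: MemoryGroupResourceScope ties acquisition of the memory group to the lifetime of a local object, so the group is released on every exit path of run() rather than by a matching release() call at the end. A minimal sketch of the idea, using a simplified MemoryGroup instead of the library's actual classes:

// Illustrative RAII sketch only -- not the arm_compute implementation.
struct MemoryGroupSketch
{
    void acquire() { /* map/allocate the group's backing memory */ }
    void release() { /* unmap/free it */ }
};

class MemoryGroupResourceScopeSketch
{
public:
    explicit MemoryGroupResourceScopeSketch(MemoryGroupSketch &group) : _group(group)
    {
        _group.acquire(); // acquired on construction
    }
    ~MemoryGroupResourceScopeSketch()
    {
        _group.release(); // released on any exit path, including exceptions
    }
    MemoryGroupResourceScopeSketch(const MemoryGroupResourceScopeSketch &) = delete;
    MemoryGroupResourceScopeSketch &operator=(const MemoryGroupResourceScopeSketch &) = delete;

private:
    MemoryGroupSketch &_group;
};

void run_sketch(MemoryGroupSketch &mg)
{
    MemoryGroupResourceScopeSketch scope_mg(mg); // replaces mg.acquire();
    // ... enqueue kernels ...
}                                                // replaces mg.release();

On the straight-line path this is equivalent to the old acquire()/release() pair; the difference is that early returns and thrown errors can no longer skip the release.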
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 7105e85..03d516f 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -262,7 +262,7 @@
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, conv_w * conv_h);
- // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
+ // TODO(COMPMID-2078): input->clone() doesn't work with subtensors for grouped convolutions.
TensorInfo info_gemm(shape_gemm, 1, data_type);
info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
_gemm_output.allocator()->init(info_gemm);
@@ -372,7 +372,9 @@
const unsigned int kernel_width = weights->dimension(idx_width);
const unsigned int kernel_height = weights->dimension(idx_height);
- TensorInfo im2col_reshaped_info, info_gemm, weights_reshaped_info;
+ TensorInfo im2col_reshaped_info{};
+ TensorInfo info_gemm{};
+ TensorInfo weights_reshaped_info{};
const ITensorInfo *gemm_input_to_use = input;
const ITensorInfo *gemm_output_to_use = output;
const ITensorInfo *weights_to_use = weights;
@@ -526,7 +528,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run im2col
if(!_skip_im2col)
@@ -562,8 +564,6 @@
{
_activationlayer_function.run();
}
-
- _memory_group.release();
}
void CLGEMMConvolutionLayer::prepare()
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
new file mode 100644
index 0000000..bcb91e0
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "utils/TypePrinter.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace
+{
+std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
+{
+ Coordinates start;
+ Coordinates end;
+
+ if(is_nchw)
+ {
+ start.set(0, deconv_info.pad_left());
+ start.set(1, deconv_info.pad_top());
+ end.set(0, output_info.dimension(0) - deconv_info.pad_right());
+ end.set(1, output_info.dimension(1) - deconv_info.pad_bottom());
+ }
+ else
+ {
+ start.set(0, 0);
+ start.set(1, deconv_info.pad_left());
+ start.set(2, deconv_info.pad_top());
+
+ end.set(0, output_info.dimension(0));
+ end.set(1, output_info.dimension(1) - deconv_info.pad_right());
+ end.set(2, output_info.dimension(2) - deconv_info.pad_bottom());
+ }
+
+ return { start, end };
+}
+} // namespace
+
+CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _mm_gemm(),
+ _mm_gemmlowp(),
+ _gemmlowp_output_stage(),
+ _permute_input_to_nhwc(),
+ _permute_weights_to_nhwc(),
+ _reshape_weights(),
+ _transpose_weights(),
+ _deconv_reshape(),
+ _slice_gemm(),
+ _gemmlowp_final(),
+ _reshaped_weights(),
+ _reshaped_weights_t(),
+ _permuted_input(),
+ _permuted_weights(),
+ _gemm_output(),
+ _slice_gemm_input(),
+ _original_weights(),
+ _is_prepared(false),
+ _padded_input(false),
+ _is_nchw(false),
+ _is_quantized(false)
+{
+}
+
+Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ DataLayout data_layout = input->data_layout();
+ const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+ const bool is_nchw = input->data_layout() == DataLayout::NCHW;
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_b = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != deconv_info.stride().first);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) != deconv_info.stride().second);
+
+ TensorShape nhwc_weights_shape = weights->tensor_shape();
+ TensorShape nhwc_input_shape = input->tensor_shape();
+
+ if(is_nchw)
+ {
+ permute(nhwc_weights_shape, PermutationVector(2, 0, 1));
+ permute(nhwc_input_shape, PermutationVector(2, 0, 1));
+
+ TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW);
+
+ TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW);
+
+ CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1));
+ CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1));
+ }
+
+ const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
+ const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info));
+
+ TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]);
+ const TensorInfo reshaped_t_info = reshaped_info.clone()->set_is_resizable(true).set_tensor_shape(transposed_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info));
+
+ TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b),
+ input->dimension(idx_w),
+ input->dimension(idx_h),
+ input->dimension(idx_b));
+
+ TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true);
+ GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true);
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32),
+ gemm_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
+ }
+
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
+ 0, 0, deconv_info.stride().first, deconv_info.stride().second);
+ const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+ TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+
+ if(padded_input && is_quantized)
+ {
+ const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr,
+ &col2im_output_info.clone()->set_is_resizable(true).set_data_type(DataType::QASYMM8)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(DataType::QASYMM8), output, start_end.first, start_end.second));
+ }
+ else if(padded_input)
+ {
+ const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second));
+ }
+ else if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr, output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
+ }
+
+ return Status{};
+}
+
+void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
+ weights->info(),
+ bias != nullptr ? bias->info() : nullptr,
+ output->info(),
+ deconv_info));
+
+ _original_weights = weights;
+ _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+
+ const ICLTensor *input_to_use = input;
+ const ICLTensor *weights_to_use = weights;
+
+ // If the data layout is NCHW, transform everything to NHWC. Another alternative could be to
+ // do an outer product in NCHW and then an accumulation through a reduction. This would have two
+ // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction
+ // might be slower than GEMM.
+ if(_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+
+ _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+
+ input_to_use = &_permuted_input;
+ weights_to_use = &_permuted_weights;
+ }
+
+ // Reshape the input weights. The weights will be reshaped only once during the call to prepare()
+ _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0),
+ weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)),
+ 1,
+ input->info()->data_type(), weights->info()->quantization_info()));
+
+ _reshape_weights.configure(weights_to_use, &_reshaped_weights);
+ _transpose_weights.configure(&_reshaped_weights, &_reshaped_weights_t);
+
+ const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true);
+
+ // Configure matrix multiply: GEMMLowp for asymmetric quantized types, GEMM otherwise
+ if(_is_quantized)
+ {
+ _mm_gemmlowp.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
+ }
+ else
+ {
+ _mm_gemm.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+ }
+
+ if(_is_nchw)
+ {
+ _permuted_input.allocator()->allocate();
+ }
+
+ ICLTensor *deconv_reshape_output = nullptr;
+ ICLTensor *slice_output = nullptr;
+ ICLTensor *output_stage_output = nullptr;
+
+ if(_padded_input && _is_quantized)
+ {
+ _memory_group.manage(&_slice_gemm_input);
+ _memory_group.manage(&_gemmlowp_final);
+ deconv_reshape_output = &_gemmlowp_final;
+ output_stage_output = &_slice_gemm_input;
+ slice_output = output;
+ }
+ else if(_padded_input)
+ {
+ _memory_group.manage(&_slice_gemm_input);
+ deconv_reshape_output = &_slice_gemm_input;
+ slice_output = output;
+ }
+ else if(_is_quantized)
+ {
+ _memory_group.manage(&_gemmlowp_final);
+ deconv_reshape_output = &_gemmlowp_final;
+ output_stage_output = output;
+ }
+ else
+ {
+ deconv_reshape_output = output;
+ }
+
+ // Configure a Col2Im call to reshape the output of GEMM
+ _deconv_reshape.configure(&_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+ _gemm_output.allocator()->allocate();
+
+ if(_is_quantized)
+ {
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / _gemmlowp_final.info()->quantization_info().scale;
+ int output_multiplier(0);
+ int output_shift(0);
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, _gemmlowp_final.info()->quantization_info().offset);
+ _gemmlowp_final.allocator()->allocate();
+ }
+
+ // If the input was padded, the output needs to be sliced.
+ if(_padded_input)
+ {
+ const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
+ _slice_gemm.configure(&_slice_gemm_input, slice_output, start_end.first, start_end.second);
+ _slice_gemm_input.allocator()->allocate();
+ }
+}
+
+void CLGEMMDeconvolutionLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ if(_is_nchw)
+ {
+ _permute_input_to_nhwc.run();
+ }
+
+ if(_is_quantized)
+ {
+ _mm_gemmlowp.run();
+ }
+ else
+ {
+ _mm_gemm.run();
+ }
+
+ CLScheduler::get().enqueue(_deconv_reshape, false);
+
+ if(_is_quantized)
+ {
+ _gemmlowp_output_stage.run();
+ }
+
+ if(_padded_input)
+ {
+ _slice_gemm.run();
+ }
+}
+
+void CLGEMMDeconvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ if(_is_nchw)
+ {
+ _permuted_weights.allocator()->allocate();
+ _permute_weights_to_nhwc.run();
+ }
+
+ _reshaped_weights.allocator()->allocate();
+ _reshape_weights.run();
+
+ if(_is_nchw)
+ {
+ _permuted_weights.allocator()->free();
+ }
+
+ _reshaped_weights_t.allocator()->allocate();
+ _transpose_weights.run();
+
+ // Prepare gemm
+ if(!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ }
+ else
+ {
+ _mm_gemmlowp.prepare();
+ }
+
+ // Free resources
+ if(!_reshaped_weights_t.is_used())
+ {
+ _reshaped_weights_t.allocator()->free();
+ }
+
+ _original_weights->mark_as_unused();
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
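The quantized output stage above turns the requantization factor (input scale * weights scale / output scale) into an integer multiplier plus a right shift through quantization::calculate_quantized_multiplier_less_than_one. A self-contained sketch of that decomposition, in the spirit of the usual gemmlowp scheme and assuming a multiplier in (0, 1) -- it is not the library's implementation:

#include <cmath>
#include <cstdint>

// Decompose multiplier ~= quantized_multiplier * 2^-31 * 2^-right_shift,
// with quantized_multiplier a Q0.31 fixed-point value in [2^30, 2^31).
void quantize_multiplier_sketch(double multiplier, int32_t *quantized_multiplier, int *right_shift)
{
    int exponent = 0;
    const double q = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
    *right_shift    = -exponent;
    int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1LL << 31)));
    if(q_fixed == (1LL << 31)) // rounding pushed q up to 1.0: renormalize
    {
        q_fixed /= 2;
        --*right_shift;
    }
    *quantized_multiplier = static_cast<int32_t>(q_fixed);
}

At run time the int32 accumulator is multiplied by the fixed-point value and shifted right by roughly 31 + right_shift bits with rounding, which approximates multiplication by the original real factor without any floating-point arithmetic.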
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2a01db7..049db1d 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
@@ -31,7 +32,6 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
namespace arm_compute
{
@@ -40,17 +40,16 @@
namespace
{
-inline bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
- return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (m > 1) && (reshape_b_only_on_first_run);
+ return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
}
} // namespace
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_mm_kernel(),
- _mm_reshaped_kernel(),
- _mtx_a_reshape_kernel(),
+ _mm_reshaped_only_rhs_kernel(),
_mtx_b_reshape_kernel(),
_mtx_a_reduction_kernel(),
_mtx_b_reduction_kernel(),
@@ -58,7 +57,6 @@
_offset_contribution_output_stage_kernel(),
_vector_sum_col(),
_vector_sum_row(),
- _tmp_a(),
_tmp_b(),
_mm_result_s32(),
_original_b(nullptr),
@@ -86,7 +84,6 @@
const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _mtx_a_reshape_kernel.set_target(gpu_target);
_mm_kernel.set_target(gpu_target);
const ICLTensor *matrix_a = a;
@@ -105,29 +102,21 @@
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
// Check if we need to reshape the matrix A and matrix B
- _is_gemm_reshaped = is_gemm_reshaped(m, _reshape_b_only_on_first_run, gpu_target);
+ _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
if(_is_gemm_reshaped)
{
- // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
- reinterpret_input_as_3d = false;
-
- matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
- _memory_group.manage(&_tmp_a);
if(!_reshape_b_only_on_first_run)
{
_memory_group.manage(&_tmp_b);
}
// Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
- // Configure interleave kernel
- _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-
- // Configure transpose kernel
+ // Configure reshape RHS kernel
_mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
}
@@ -166,7 +155,7 @@
if(_is_gemm_reshaped)
{
// Configure and tune matrix multiply kernel
- _mm_reshaped_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
else
{
@@ -185,7 +174,7 @@
if(_is_gemm_reshaped)
{
// Configure and tune matrix multiply kernel
- _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
else
{
@@ -200,7 +189,6 @@
// Allocate tensors
if(_is_gemm_reshaped)
{
- _tmp_a.allocator()->allocate();
if(!_reshape_b_only_on_first_run)
{
_tmp_b.allocator()->allocate();
@@ -231,11 +219,13 @@
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
- TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
GEMMRHSMatrixInfo rhs_info;
GEMMLHSMatrixInfo lhs_info;
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
const unsigned int n = b->dimension(0);
@@ -243,35 +233,24 @@
const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- bool reshape_matrices = is_gemm_reshaped(m, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
-
- // if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
- if(reshape_matrices)
- {
- reinterpret_input_as_3d = false;
- }
+ bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
- if(reshape_matrices)
+ if(reshape_matrix_b)
{
- matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
// Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
- // Validate transpose kernel
-
+ // Validate reshape RHS kernel
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
}
- TensorInfo info_vector_sum_col, info_vector_sum_row;
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
// Validate matrix B reduction kernel only if _a_offset is not equal to 0
if(a_offset != 0)
@@ -295,13 +274,13 @@
{
TensorInfo mm_result_s32_info{};
- if(reshape_matrices)
+ if(reshape_matrix_b)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
}
else
{
@@ -322,22 +301,25 @@
}
else
{
- if(reshape_matrices)
+ if(reshape_matrix_b)
{
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
}
else
{
// Validate matrix multiply
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
}
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- a_offset, b_offset));
+ if(output->total_size() != 0)
+ {
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ a_offset, b_offset));
+ }
}
return Status{};
@@ -347,13 +329,10 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_is_gemm_reshaped)
{
- // Run reshape matrix A
- CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
-
if(!_reshape_b_only_on_first_run)
{
// Run reshape matrix B
@@ -370,7 +349,7 @@
// Run matrix multiply
if(_is_gemm_reshaped)
{
- CLScheduler::get().enqueue(_mm_reshaped_kernel, false);
+ CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false);
}
else
{
@@ -393,8 +372,6 @@
// Run offset contribution kernel
CLScheduler::get().enqueue(_offset_contribution_kernel, true);
}
-
- _memory_group.release();
}
void CLGEMMLowpMatrixMultiplyCore::prepare()
@@ -422,4 +399,4 @@
_is_prepared = true;
}
}
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
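For reference, the matrix A/B reductions (_vector_sum_row, _vector_sum_col) and the offset-contribution step validated above implement the standard zero-point correction for quantized GEMM. The identity they rely on, checked here with plain integers as a didactic reference rather than the kernels' code:

#include <cassert>
#include <cstdint>
#include <vector>

// For quantized vectors a, b of length K with zero points za, zb:
//   sum_k (a[k] - za) * (b[k] - zb)
// = sum_k a[k]*b[k] - zb*sum_k a[k] - za*sum_k b[k] + K*za*zb
// The raw product and the two sums are what the GEMM and reduction kernels
// produce; the remaining terms are the offset contribution.
int32_t offset_corrected_dot(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b, int32_t za, int32_t zb)
{
    assert(a.size() == b.size());
    const int32_t K = static_cast<int32_t>(a.size());
    int32_t raw = 0, sum_a = 0, sum_b = 0, reference = 0;
    for(int32_t k = 0; k < K; ++k)
    {
        raw       += a[k] * b[k];
        sum_a     += a[k];
        sum_b     += b[k];
        reference += (a[k] - za) * (b[k] - zb);
    }
    const int32_t corrected = raw - zb * sum_a - za * sum_b + K * za * zb;
    assert(corrected == reference); // the identity the offset contribution relies on
    return corrected;
}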
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index f30eee1..ea803e4 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,10 +62,8 @@
{
CLScheduler::get().enqueue(_border_handler, false);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index fd82769..b671b23 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -76,10 +76,10 @@
if(num_levels > 1)
{
- _horizontal_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
- _vertical_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+ _horizontal_border_handler.resize(num_levels - 1);
+ _vertical_border_handler.resize(num_levels - 1);
+ _horizontal_reduction.resize(num_levels - 1);
+ _vertical_reduction.resize(num_levels - 1);
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -153,8 +153,8 @@
if(num_levels > 1)
{
- _gauss5x5 = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
- _scale_nearest = arm_compute::support::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
+ _gauss5x5.resize(num_levels - 1);
+ _scale_nearest.resize(num_levels - 1);
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index c50132e..d712a23 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -256,7 +256,7 @@
void CLGenerateProposalsLayer::run()
{
// Acquire all the temporaries
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Compute all the anchors
CLScheduler::get().enqueue(_compute_anchors_kernel, false);
@@ -277,8 +277,5 @@
// Add dummy batch indexes
CLScheduler::get().enqueue(_memset_kernel, true);
CLScheduler::get().enqueue(_padded_copy_kernel, true);
-
- // Release all the temporaries
- _memory_group.release();
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index 1470d5c..0931443 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -95,7 +95,7 @@
void CLHOGDescriptor::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run gradient
_gradient.run();
@@ -105,6 +105,4 @@
// Run block normalization
CLScheduler::get().enqueue(_block_norm);
-
- _memory_group.release();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 51aeaed..e509fd8 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -71,13 +71,11 @@
void CLHOGGradient::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run derivative
_derivative.run();
// Run magnitude/phase kernel
CLScheduler::get().enqueue(_mag_phase);
-
- _memory_group.release();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 8012c2f..f799d61 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -128,12 +128,11 @@
_num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
_num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
- _orient_bin_kernel = arm_compute::support::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
- _block_norm_kernel = arm_compute::support::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
- _hog_detect_kernel = arm_compute::support::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
- _non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
- _hog_space = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
- _hog_norm_space = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+ _orient_bin_kernel.resize(_num_orient_bin_kernel);
+ _block_norm_kernel.resize(_num_block_norm_kernel);
+ _hog_detect_kernel.resize(_num_hog_detect_kernel);
+ _hog_space.resize(_num_orient_bin_kernel);
+ _hog_norm_space.resize(_num_block_norm_kernel);
// Allocate tensors for magnitude and phase
TensorInfo info_mag(shape_img, Format::S16);
@@ -172,10 +171,10 @@
_hog_space[i].allocator()->init(info_space);
// Manage intermediate buffers
- _memory_group.manage(_hog_space.get() + i);
+ _memory_group.manage(&_hog_space[i]);
// Initialise orientation binning kernel
- _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ _orient_bin_kernel[i].configure(&_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
}
// Allocate intermediate tensors
@@ -193,10 +192,10 @@
_hog_norm_space[i].allocator()->init(tensor_info);
// Manage intermediate buffers
- _memory_group.manage(_hog_norm_space.get() + i);
+ _memory_group.manage(&_hog_norm_space[i]);
// Initialize block normalization kernel
- _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ _block_norm_kernel[i].configure(&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
}
// Allocate intermediate tensors
@@ -212,13 +211,13 @@
{
const size_t idx_block_norm = input_hog_detect[i];
- _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ _hog_detect_kernel[i].configure(&_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
}
detection_window_strides->unmap(CLScheduler::get().queue());
// Configure non maxima suppression kernel
- _non_maxima_kernel->configure(_detection_windows, min_distance);
+ _non_maxima_kernel.configure(_detection_windows, min_distance);
// Allocate intermediate tensors
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
@@ -231,7 +230,7 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Reset detection window
_detection_windows->clear();
@@ -242,13 +241,13 @@
// Run orientation binning kernel
for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
{
- CLScheduler::get().enqueue(*(_orient_bin_kernel.get() + i), false);
+ CLScheduler::get().enqueue(_orient_bin_kernel[i], false);
}
// Run block normalization kernel
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
- CLScheduler::get().enqueue(*(_block_norm_kernel.get() + i), false);
+ CLScheduler::get().enqueue(_block_norm_kernel[i], false);
}
// Run HOG detector kernel
@@ -262,9 +261,7 @@
{
// Map detection windows array before computing non maxima suppression
_detection_windows->map(CLScheduler::get().queue(), true);
- Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
+ Scheduler::get().schedule(&_non_maxima_kernel, Window::DimY);
_detection_windows->unmap(CLScheduler::get().queue());
}
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 65ce7de..67f550d3 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,7 @@
_gy(),
_score(),
_nonmax(),
- _corners_list(nullptr),
+ _corners_list(),
_num_corner_candidates(0),
_corners(nullptr)
{
@@ -84,7 +84,7 @@
_score.allocator()->init(info_f32);
_nonmax.allocator()->init(info_f32);
- _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+ _corners_list.resize(shape.x() * shape.y());
// Manage intermediate buffers
_memory_group.manage(&_gx);
@@ -146,20 +146,20 @@
_score.allocator()->allocate();
// Init corner candidates kernel
- _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+ _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
// Allocate intermediate buffers
_nonmax.allocator()->allocate();
// Init euclidean distance
- _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
+ _sort_euclidean.configure(_corners_list.data(), _corners, &_num_corner_candidates, min_dist);
}
void CLHarrisCorners::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Init to 0 number of corner candidates
_num_corner_candidates = 0;
@@ -185,6 +185,4 @@
_corners->map(CLScheduler::get().queue(), true);
Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
_corners->unmap(CLScheduler::get().queue());
-
- _memory_group.release();
}
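The CLGaussianPyramid, CLHOGMultiDetection and CLHarrisCorners hunks above all apply the same migration: fixed-size arrays held through make_unique<T[]> become std::vector<T>, so elements are reached with v[i], &v[i] or v.data() instead of pointer arithmetic on v.get(). A minimal sketch of the two styles, using a stand-in Kernel type rather than a library class:

#include <cstddef>
#include <memory>
#include <vector>

struct Kernel // stand-in for a CL kernel wrapper
{
    void configure(int /*param*/) {}
    void enqueue() {}
};

void before_and_after(std::size_t n)
{
    // Old style: unique_ptr-owned array, elements addressed via get() + i.
    auto old_kernels = std::make_unique<Kernel[]>(n);
    for(std::size_t i = 0; i < n; ++i)
    {
        (old_kernels.get() + i)->configure(static_cast<int>(i));
    }

    // New style: std::vector owns the kernels and knows its size.
    std::vector<Kernel> kernels;
    kernels.resize(n);
    for(std::size_t i = 0; i < n; ++i)
    {
        kernels[i].configure(static_cast<int>(i));
        kernels[i].enqueue();
    }
}

This is what lets the new code pass &_hog_space[i] to _memory_group.manage(), enqueue _orient_bin_kernel[i] directly, and hand _corners_list.data() to the kernels that still expect a raw pointer.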
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 2e3c6d7..136cb5e 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,11 +74,9 @@
void CLL2NormalizeLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
_reduce_func.run();
CLScheduler::get().enqueue(_normalize_kernel, true);
-
- _memory_group.release();
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index f01b1b8..4606a66 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -43,10 +43,11 @@
_pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
_pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
_accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
- _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
- _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
- _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
- _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
+ _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
+ _ones_memset_kernel(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(),
+ _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(),
+ _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false),
+ _is_prepared(false)
{
}
@@ -93,25 +94,38 @@
lstm_params_info, activation_info, cell_threshold, projection_threshold));
const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
-
// Configure block that calculates the forget gate
// forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
- TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ // We optimize this as follows:
+ // forget_gate = Activation((input,output_state_in) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
_forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
_forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _memory_group.manage(&_forget_gate_out1);
- _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
+ std::vector<const ICLTensor *> inputs_vector;
+ inputs_vector.emplace_back(input);
+ inputs_vector.emplace_back(output_state_in);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type()));
+
_memory_group.manage(&_forget_gate_out2);
- _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
- _memory_group.manage(&_forget_gate_out3);
- _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
- _forget_gate_out2.allocator()->allocate();
+ _concat_inputs_forget_gate.configure(input, output_state_in, &_forget_gate_out2);
+
+ std::vector<const ICLTensor *> weights_vector;
+
+ weights_vector.emplace_back(input_to_forget_weights);
+ weights_vector.emplace_back(recurrent_to_forget_weights);
+ const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
+ _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
+
+ _concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
+
_memory_group.manage(&_forget_gate_out5);
- _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
- _forget_gate_out1.allocator()->allocate();
+ _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
+ _memory_group.manage(&_forget_gate_out1);
+ _memory_group.manage(&_forget_gate_out3);
+ _forget_gate_out6.allocator()->allocate();
+
CLTensor *forget_gate_out = &_forget_gate_out5;
if(lstm_params.has_peephole_opt())
{
@@ -134,43 +148,46 @@
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
// input_gate = 1 - forget_gate, with CIFG
+ // We optimize this as follows:
+ // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
CLTensor *input_gate_out = &_input_gate_out1;
if(lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _ones_memset_kernel.configure(&_ones, PixelValue(1, _ones.info()->data_type()));
_subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
else
{
- TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-
- _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
_input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ std::vector<const ICLTensor *> lstm_weights;
+ lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+ lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+ TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
+
+ _concat_weights_input_gate.configure(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
_memory_group.manage(&_input_gate_out1);
- _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
- _memory_group.manage(&_input_gate_out2);
- _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+
_memory_group.manage(&_input_gate_out3);
- _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+ _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3);
_input_gate_out2.allocator()->allocate();
- _memory_group.manage(&_input_gate_out4);
- _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
- _input_gate_out3.allocator()->allocate();
- input_gate_out = &_input_gate_out4;
+
+ input_gate_out = &_input_gate_out3;
if(_run_peephole_opt)
{
- _memory_group.manage(&_input_gate_out5);
- _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _memory_group.manage(&_input_gate_out4);
+ _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
- _input_gate_out5.allocator()->allocate();
input_gate_out = &_input_gate_out1;
}
else
@@ -215,35 +232,39 @@
// Configure block that calculates the output
// output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
- TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ // We optimize this as follows:
+ // output_state_out = Activation( (input,output_state_in) * (input_to_output_weights, recurrent_to_output_weights) + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
_output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
- _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ std::vector<const ICLTensor *> in_out_weights;
+ in_out_weights.emplace_back(input_to_output_weights);
+ in_out_weights.emplace_back(recurrent_to_output_weights);
+ TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
+
+ _concat_weights_output.configure(input_to_output_weights, recurrent_to_output_weights, &_output2);
_memory_group.manage(&_output1);
- _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
- _memory_group.manage(&_output2);
- _transpose_output.configure(recurrent_to_output_weights, &_output2);
- _memory_group.manage(&_output3);
- _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
+ _memory_group.manage(&_output4);
+
+ _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4);
+
_output2.allocator()->allocate();
- _memory_group.manage(&_output5);
- _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
- _output3.allocator()->allocate();
- CLTensor *output_gate_out = &_output5;
+ _forget_gate_out2.allocator()->allocate();
+
+ CLTensor *output_gate_out = &_output4;
if(lstm_params.has_peephole_opt())
{
- _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
+ _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
- _memory_group.manage(&_output4);
- _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
- _output5.allocator()->allocate();
+ _memory_group.manage(&_output3);
+ _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+ _output4.allocator()->allocate();
output_gate_out = &_output1;
// Allocate intermediate buffers
- _output4.allocator()->allocate();
+ _output3.allocator()->allocate();
}
else
{
@@ -369,8 +390,15 @@
// Validate forget gate
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+
+ std::vector<const ITensorInfo *> inputs_vector;
+ inputs_vector.emplace_back(input);
+ inputs_vector.emplace_back(output_state_in);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -388,9 +416,15 @@
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
+ std::vector<const ITensorInfo *> lstm_weights;
+ lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+ lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+ TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat));
+
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
@@ -419,10 +453,15 @@
cell_threshold)));
}
+ std::vector<const ITensorInfo *> in_out_weights;
+ in_out_weights.emplace_back(input_to_output_weights);
+ in_out_weights.emplace_back(recurrent_to_output_weights);
+ TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat));
// Validate output gate tmp
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
@@ -464,12 +503,13 @@
void CLLSTMLayer::run()
{
- _memory_group.acquire();
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ CLScheduler::get().enqueue(_concat_inputs_forget_gate);
_fully_connected_forget_gate.run();
- CLScheduler::get().enqueue(_transpose_forget_gate);
- _gemm_forget_gate.run();
- CLScheduler::get().enqueue(_accum_forget_gate1);
if(_run_peephole_opt)
{
@@ -480,24 +520,13 @@
if(_run_cifg_opt)
{
- _ones.map(true);
- if(_ones.info()->data_type() == DataType::F16)
- {
- std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
- }
- else
- {
- std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
- }
- _ones.unmap();
+ CLScheduler::get().enqueue(_ones_memset_kernel);
CLScheduler::get().enqueue(_subtract_input_gate);
}
else
{
_fully_connected_input_gate.run();
- CLScheduler::get().enqueue(_transpose_input_gate);
- _gemm_input_gate.run();
- CLScheduler::get().enqueue(_accum_input_gate1);
+
if(_run_peephole_opt)
{
CLScheduler::get().enqueue(_pixelwise_mul_input_gate);
@@ -521,9 +550,6 @@
}
_fully_connected_output.run();
- CLScheduler::get().enqueue(_transpose_output);
- _gemm_output.run();
- CLScheduler::get().enqueue(_accum_output1);
if(_run_peephole_opt)
{
@@ -548,6 +574,18 @@
CLScheduler::get().enqueue(_copy_output);
_concat_scratch_buffer.run();
+}
- _memory_group.release();
+void CLLSTMLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ CLScheduler::get().enqueue(_concat_weights_forget_gate);
+ if(!_run_cifg_opt)
+ {
+ CLScheduler::get().enqueue(_concat_weights_input_gate);
+ }
+ CLScheduler::get().enqueue(_concat_weights_output);
+ _is_prepared = true;
+ }
}
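
The comment added to the output-gate block above summarizes the optimization applied throughout CLLSTMLayer: each gate's fully connected layer, transpose, GEMM and addition are collapsed into a single fully connected call on the width-concatenated (input, output_state_in) pair and the width-concatenated weight matrices, with the weight concatenations enqueued once in the new prepare() step. The following standalone sketch (plain C++, illustrative names and row-major layout, not arm_compute code) checks that the two formulations agree:

    // Standalone sketch: concatenating the input with the previous output state, and
    // the input weights with the recurrent weights, lets one matrix product replace
    // the former FC + transpose + GEMM + Add chain.
    #include <cassert>
    #include <vector>

    // y = W * v for a row-major (rows x cols) matrix stored in a flat vector.
    static std::vector<float> matvec(const std::vector<float> &W, const std::vector<float> &v,
                                     int rows, int cols)
    {
        std::vector<float> y(rows, 0.f);
        for(int r = 0; r < rows; ++r)
        {
            for(int c = 0; c < cols; ++c)
            {
                y[r] += W[r * cols + c] * v[c];
            }
        }
        return y;
    }

    int main()
    {
        const int units = 2, in_dim = 3;
        const std::vector<float> Wx = { 1, 2, 3, 4, 5, 6 }; // input-to-gate weights, units x in_dim
        const std::vector<float> Wh = { 7, 8, 9, 10 };      // recurrent-to-gate weights, units x units
        const std::vector<float> x  = { 1, 0, -1 };         // current input
        const std::vector<float> h  = { 2, 3 };             // previous output state

        // Two products plus an addition, as in the old code path.
        std::vector<float>       y_old = matvec(Wx, x, units, in_dim);
        const std::vector<float> y_rec = matvec(Wh, h, units, units);
        for(int r = 0; r < units; ++r) { y_old[r] += y_rec[r]; }

        // One product on the concatenated operands, as in the new code path.
        std::vector<float> W_cat, v_cat;
        for(int r = 0; r < units; ++r)
        {
            W_cat.insert(W_cat.end(), Wx.begin() + r * in_dim, Wx.begin() + (r + 1) * in_dim);
            W_cat.insert(W_cat.end(), Wh.begin() + r * units, Wh.begin() + (r + 1) * units);
        }
        v_cat = x;
        v_cat.insert(v_cat.end(), h.begin(), h.end());
        const std::vector<float> y_new = matvec(W_cat, v_cat, units, in_dim + units);

        for(int r = 0; r < units; ++r) { assert(y_old[r] == y_new[r]); }
        return 0;
    }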
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 559b57f..a118518 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,8 +70,8 @@
// Create Gaussian Pyramid function
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
- _convf = arm_compute::support::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
- _subf = arm_compute::support::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
+ _convf.resize(_num_levels);
+ _subf.resize(_num_levels);
for(unsigned int i = 0; i < _num_levels; ++i)
{
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index 911c9b3..13116bf 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,8 +63,8 @@
_tmp_pyr.init(pyramid_info);
// Allocate add and scale functions. Level 0 does not need to be scaled.
- _addf = arm_compute::support::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
- _scalef = arm_compute::support::cpp14::make_unique<CLScale[]>(num_levels - 1);
+ _addf.resize(num_levels);
+ _scalef.resize(num_levels - 1);
const size_t last_level = num_levels - 1;
@@ -85,7 +85,7 @@
void CLLaplacianReconstruct::run()
{
- ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+ ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function");
const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 5c6bef9..3e99dde 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -176,7 +176,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
@@ -186,8 +186,6 @@
// Reshape output matrix
CLScheduler::get().enqueue(_output_col2im_kernel, false);
-
- _memory_group.release();
}
void CLLocallyConnectedLayer::prepare()
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 157f306..8517b59 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -104,7 +104,7 @@
template <typename T>
void CLMeanStdDev::run_float()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Perform reduction on x-axis
_reduction_operation_mean.run();
@@ -140,8 +140,6 @@
_reduction_output_stddev.unmap();
}
_reduction_output_mean.unmap();
-
- _memory_group.release();
}
void CLMeanStdDev::run_int()
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index d00b1b5..a013a1f 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -84,12 +84,12 @@
const int old_values_list_length = list_length * window_dimension * window_dimension;
// Create kernels and tensors
- _tracker_init_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
- _tracker_stage0_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
- _tracker_stage1_kernel = arm_compute::support::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
- _func_scharr = arm_compute::support::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
- _scharr_gx = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
- _scharr_gy = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_levels);
+ _tracker_init_kernel.resize(_num_levels);
+ _tracker_stage0_kernel.resize(_num_levels);
+ _tracker_stage1_kernel.resize(_num_levels);
+ _func_scharr.resize(_num_levels);
+ _scharr_gx.resize(_num_levels);
+ _scharr_gy.resize(_num_levels);
// Create internal keypoint arrays
_old_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
@@ -118,8 +118,8 @@
_scharr_gy[i].allocator()->init(tensor_info);
// Manage intermediate buffers
- _memory_group.manage(_scharr_gx.get() + i);
- _memory_group.manage(_scharr_gy.get() + i);
+ _memory_group.manage(&_scharr_gx[i]);
+ _memory_group.manage(&_scharr_gy[i]);
// Init Scharr kernel
_func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
@@ -149,7 +149,7 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
for(unsigned int level = _num_levels; level > 0; --level)
{
@@ -167,6 +167,4 @@
}
CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
-
- _memory_group.release();
}
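
This file, like most of the functions in this patch, swaps the explicit _memory_group.acquire()/release() pair for a scoped guard. A minimal sketch of the RAII idiom presumably behind MemoryGroupResourceScope is shown below, using a hypothetical ResourceGroup type; the point is that intermediate buffers are released on every exit path from run(), including early returns and exceptions:

    #include <iostream>

    struct ResourceGroup
    {
        void acquire() { std::cout << "acquire\n"; }
        void release() { std::cout << "release\n"; }
    };

    class ResourceScope
    {
    public:
        explicit ResourceScope(ResourceGroup &group) : _group(group) { _group.acquire(); }
        ~ResourceScope() { _group.release(); }
        ResourceScope(const ResourceScope &) = delete;
        ResourceScope &operator=(const ResourceScope &) = delete;

    private:
        ResourceGroup &_group;
    };

    void run(ResourceGroup &group)
    {
        ResourceScope scope(group); // acquire happens here
        // ... enqueue kernels ...
    }                               // release happens here, on every exit path

    int main()
    {
        ResourceGroup group;
        run(group);
        return 0;
    }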
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 3aa1b1e..99e3121 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -25,39 +25,293 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
{
CLPadLayer::CLPadLayer()
- : _copy_kernel(), _fillborder_kernel(), _memset_kernel()
+ : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
{
}
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value)
+void CLPadLayer::configure_constant_mode(ICLTensor *input, ICLTensor *output, const PaddingList &padding, const PixelValue constant_value)
{
- // Copy the input to the output
- _copy_kernel.configure(input, output, padding);
-
- // Set the pages of the output to zero
+ // Set the pages of the output to the constant_value.
_memset_kernel.configure(output, constant_value);
- // Fill padding on the first two dimensions with zeros
- _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT, constant_value);
+ // Extend the padding list with zeroes up to the maximum number of dimensions.
+ PaddingList padding_extended = padding;
+ for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+ {
+ padding_extended.emplace_back(PaddingInfo{ 0, 0 });
+ }
+
+ // Create a window within the output tensor where the input will be copied.
+ Window copy_window = Window();
+ for(uint32_t i = 0; i < output->info()->num_dimensions(); ++i)
+ {
+ copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->info()->dimension(i), 1));
+ }
+ // Copy the input to the output, leaving the padding filled with the constant_value.
+ _copy_kernel.configure(input, output, PaddingList(), ©_window);
}
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+void CLPadLayer::configure_reflect_symmetric_mode(ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, constant_value));
- ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
+ int64_t last_padding_dimension = _padding.size() - 1;
+ // Reflecting can be performed by effectively unfolding the input as follows:
+ // For each dimension starting at DimX:
+ // Create a before and after slice, whose values depend on the selected padding mode
+ // Concatenate the before and after padding with the tensor to be padded
+ // Two strided slice functions will be required for each padded dimension, as well as a
+ // concatenate function and tensors to hold the temporary results (a standalone sketch of the resulting index pattern follows this file's diff).
+ _slice_functions.resize(2 * _num_dimensions);
+ _slice_results.resize(2 * _num_dimensions);
+ _concat_functions.resize(_num_dimensions);
+ _concat_results.resize(_num_dimensions - 1);
+
+ Coordinates starts_before{};
+ Coordinates ends_before{};
+ Coordinates starts_after{};
+ Coordinates ends_after{};
+ Coordinates strides{};
+ ICLTensor *prev = input;
+ for(uint32_t i = 0; i < _num_dimensions; ++i)
+ {
+ // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
+ if(i > 0)
+ {
+ strides.set(i - 1, 1);
+ }
+
+ if(_padding[i].first > 0 || _padding[i].second > 0)
+ {
+ // Set the starts, ends, and strides values for the current dimension.
+ // Due to the bit masks passed to strided slice, the values below the current dimension in
+ // starts and ends will be ignored so do not need to be modified.
+ if(_mode == PaddingMode::REFLECT)
+ {
+ starts_before.set(i, _padding[i].first);
+ ends_before.set(i, 0);
+ starts_after.set(i, input->info()->dimension(i) - 2);
+ ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
+ strides.set(i, -1);
+ }
+ else
+ {
+ starts_before.set(i, _padding[i].first - 1);
+ ends_before.set(i, -1);
+ starts_after.set(i, input->info()->dimension(i) - 1);
+ ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
+ strides.set(i, -1);
+ }
+
+ // Strided slice wraps negative indexes around to the end of the range;
+ // instead, this should indicate use of the full range, so the bit mask is modified accordingly.
+ const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t end_mask_before = ends_before[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t begin_mask_after = starts_after[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t end_mask_after = ends_after[i] < 0 ? ~0 : ~(1u << i);
+
+ // Reflect the input values for the padding before and after the input.
+ std::vector<ICLTensor *> concat_vector;
+ if(_padding[i].first > 0)
+ {
+ if(i < prev->info()->num_dimensions())
+ {
+ _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+ concat_vector.push_back(&_slice_results[2 * i]);
+ }
+ else
+ {
+ // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+ concat_vector.push_back(prev);
+ }
+ }
+ concat_vector.push_back(prev);
+ if(_padding[i].second > 0)
+ {
+ if(i < prev->info()->num_dimensions())
+ {
+ _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+ concat_vector.push_back(&_slice_results[2 * i + 1]);
+ }
+ else
+ {
+ // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+ concat_vector.push_back(prev);
+ }
+ }
+ // Concatenate the padding before and after with the input.
+ ICLTensor *out = (static_cast<int32_t>(i) == last_padding_dimension) ? output : &_concat_results[i];
+ _concat_functions[i].configure(concat_vector, out, i);
+ prev = out;
+ }
+ }
+ for(uint32_t i = 0; i < _num_dimensions; ++i)
+ {
+ if((static_cast<int32_t>(i) != last_padding_dimension))
+ {
+ _concat_results[i].allocator()->allocate();
+ }
+ _slice_results[2 * i].allocator()->allocate();
+ _slice_results[2 * i + 1].allocator()->allocate();
+ }
+}
+
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+
+ _padding = padding;
+ _mode = mode;
+
+ TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
+
+ // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
+ int64_t last_padding_dimension = _padding.size() - 1;
+ for(; last_padding_dimension >= 0; --last_padding_dimension)
+ {
+ if(_padding[last_padding_dimension].first > 0 || _padding[last_padding_dimension].second > 0)
+ {
+ break;
+ }
+ }
+ _num_dimensions = last_padding_dimension + 1;
+ if(_num_dimensions > 0)
+ {
+ switch(_mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ configure_constant_mode(input, output, padding, constant_value);
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ configure_reflect_symmetric_mode(input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+ }
+ else
+ {
+ // Copy the input to the whole output if no padding is applied
+ _copy_kernel.configure(input, output);
+ }
+}
+
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
+
+ TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+
+ // Use CLCopyKernel and CLMemsetKernel to validate all padding modes as this includes all of the shape and info validation.
+ PaddingList padding_extended = padding;
+ for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+ {
+ padding_extended.emplace_back(PaddingInfo{ 0, 0 });
+ }
+
+ Window copy_window = Window();
+ for(uint32_t i = 0; i < padded_shape.num_dimensions(); ++i)
+ {
+ copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->dimension(i), 1));
+ }
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), ©_window));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, constant_value));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, &input->clone()->set_tensor_shape(padded_shape), PaddingList(), ©_window));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(&input->clone()->set_tensor_shape(padded_shape), constant_value));
+ }
+
+ switch(mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ for(uint32_t i = 0; i < padding.size(); ++i)
+ {
+ if(mode == PaddingMode::REFLECT)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
+ }
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid mode");
+ }
+ }
return Status{};
}
void CLPadLayer::run()
{
- CLScheduler::get().enqueue(_memset_kernel, false);
- CLScheduler::get().enqueue(_fillborder_kernel, false);
- CLScheduler::get().enqueue(_copy_kernel, true);
+ if(_num_dimensions > 0)
+ {
+ switch(_mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ CLScheduler::get().enqueue(_memset_kernel, false);
+ CLScheduler::get().enqueue(_copy_kernel, true);
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ for(uint32_t i = 0; i < _num_dimensions; ++i)
+ {
+ if(_padding[i].first > 0 || _padding[i].second > 0)
+ {
+ if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+ {
+ _slice_functions[2 * i].run();
+ }
+ if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+ {
+ _slice_functions[2 * i + 1].run();
+ }
+ CLScheduler::get().sync();
+ _concat_functions[i].run();
+ CLScheduler::get().sync();
+ }
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_copy_kernel, true);
+ }
}
} // namespace arm_compute
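
To make the REFLECT/SYMMETRIC slice parameters above concrete, here is a standalone 1-D sketch (not arm_compute code) that applies the same start/end/stride arithmetic with a stride of -1 and an exclusive end index; the asserts spell out what each mode contributes before and after a 5-element row padded by two on each side:

    // For a dimension of size N padded by (before, after):
    //   REFLECT   before slice: start = before,     end = 0,             stride = -1
    //   REFLECT   after  slice: start = N - 2,      end = N - after - 2, stride = -1
    //   SYMMETRIC before slice: start = before - 1, end = -1,            stride = -1
    //   SYMMETRIC after  slice: start = N - 1,      end = N - after - 1, stride = -1
    #include <cassert>
    #include <vector>

    static std::vector<int> slice(const std::vector<int> &in, int start, int end)
    {
        std::vector<int> out;
        for(int i = start; i > end; --i) // stride of -1, exclusive end
        {
            out.push_back(in[i]);
        }
        return out;
    }

    int main()
    {
        const std::vector<int> in     = { 1, 2, 3, 4, 5 };
        const int              n      = static_cast<int>(in.size());
        const int              before = 2, after = 2;

        // REFLECT: edge values are not repeated -> 3 2 | 1 2 3 4 5 | 4 3
        assert(slice(in, before, 0) == (std::vector<int>{ 3, 2 }));
        assert(slice(in, n - 2, n - after - 2) == (std::vector<int>{ 4, 3 }));

        // SYMMETRIC: edge values are repeated -> 2 1 | 1 2 3 4 5 | 5 4
        assert(slice(in, before - 1, -1) == (std::vector<int>{ 2, 1 }));
        assert(slice(in, n - 1, n - after - 1) == (std::vector<int>{ 5, 4 }));

        return 0;
    }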
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index b4c20db..959464c 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,8 +29,8 @@
#include <utility>
-using namespace arm_compute;
-
+namespace arm_compute
+{
void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
@@ -54,3 +54,26 @@
{
return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
}
+
+void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComplexPixelWiseMultiplicationKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
+}
+} // namespace arm_compute
\ No newline at end of file
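
The broadcast handling in CLComplexPixelWiseMultiplication::configure only installs a replicate border when the output is wider than one element and one of the inputs has a width of 1. A small standalone sketch of that condition (illustrative function name, not the library API) is:

    #include <cassert>

    // A replicate border is only needed when the output spans more than one element
    // along X and one of the inputs is broadcast along X (width of 1).
    static bool needs_replicate_border(unsigned int out_w, unsigned int in1_w, unsigned int in2_w)
    {
        if(out_w <= 1)
        {
            return false;
        }
        const unsigned int broadcast_w = (in1_w == 1) ? in1_w : in2_w;
        return broadcast_w == 1;
    }

    int main()
    {
        assert(needs_replicate_border(8, 1, 8));  // input1 broadcast along X
        assert(needs_replicate_border(8, 8, 1));  // input2 broadcast along X
        assert(!needs_replicate_border(8, 8, 8)); // no broadcast
        assert(!needs_replicate_border(1, 1, 1)); // single-element output
        return 0;
    }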
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index a13859c..df10e1e 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,54 +21,22 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
-CLQuantizationLayer::CLQuantizationLayer()
- : _quantize_kernel(), _min_max_kernel(), _min_max()
+namespace arm_compute
{
+void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLQuantizationLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
}
Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
- TensorInfo min_max{ input->num_channels(), input->data_type() };
- ARM_COMPUTE_RETURN_ON_ERROR(CLMinMaxLayerKernel::validate(input, &min_max));
- ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(input, output, &min_max));
-
- return Status{};
+ return CLQuantizationLayerKernel::validate(input, output);
}
-
-void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
- _min_max_kernel.configure(input, &_min_max);
-
- // Configure quantize kernel
- _quantize_kernel.configure(input, output, &_min_max);
-
- // Allocate min_max tensor
- _min_max.allocator()->allocate();
-}
-
-void CLQuantizationLayer::run()
-{
- cl::CommandQueue q = CLScheduler::get().queue();
-
- // Reset min and max
- _min_max_kernel.reset(q);
-
- // Run min-max kernel
- CLScheduler::get().enqueue(_min_max_kernel, false);
-
- // Run quantize kernel
- CLScheduler::get().enqueue(_quantize_kernel, false);
-}
+} // namespace arm_compute
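
CLQuantizationLayer is reduced here to the single-kernel function pattern: configure() builds one kernel and hands ownership to the base class, whose run() simply enqueues it. A minimal sketch of that wrapper pattern with hypothetical types (not the arm_compute class hierarchy, which adds CL-specific scheduling and border handling) is:

    #include <iostream>
    #include <memory>
    #include <utility>

    struct IKernel
    {
        virtual ~IKernel() = default;
        virtual void run() = 0;
    };

    class ISimpleFunction
    {
    public:
        virtual ~ISimpleFunction() = default;
        void run() { _kernel->run(); } // single enqueue, no intermediate state

    protected:
        std::unique_ptr<IKernel> _kernel{};
    };

    struct QuantizationKernel : IKernel
    {
        void configure(const float *in, unsigned char *out) { _in = in; _out = out; }
        void run() override { std::cout << "quantize\n"; }
        const float   *_in{ nullptr };
        unsigned char *_out{ nullptr };
    };

    class QuantizationFunction : public ISimpleFunction
    {
    public:
        void configure(const float *in, unsigned char *out)
        {
            // Build the single kernel and transfer ownership to the base class.
            auto k = std::make_unique<QuantizationKernel>();
            k->configure(in, out);
            _kernel = std::move(k);
        }
    };

    int main()
    {
        float         in[4]{};
        unsigned char out[4]{};
        QuantizationFunction f;
        f.configure(in, out);
        f.run();
        return 0;
    }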
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 63f00ac..19eb69f 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -105,7 +105,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
_fully_connected_kernel.run();
_gemm_state_f.run();
@@ -114,8 +114,6 @@
// copy hidden out to output
CLScheduler::get().enqueue(_copy_kernel);
-
- _memory_group.release();
}
void CLRNNLayer::prepare()
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index b2d0f81..a3634cd 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLReduceMean.h"
+#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/core/Types.h"
@@ -40,10 +41,10 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- _reduction_ops = reduction_axis.num_dimensions();
- _reduction_kernels = arm_compute::support::cpp14::make_unique<CLReductionOperation[]>(_reduction_ops);
- _reduced_outs = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
- _keep_dims = keep_dims;
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels.resize(_reduction_ops);
+ _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
Coordinates axis_local = reduction_axis;
const int input_dims = input->info()->num_dimensions();
@@ -57,9 +58,9 @@
// Perform reduction for every axis
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
- TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+ TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
- auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+ auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
if(i == _reduction_ops - 1 && keep_dims)
{
@@ -68,8 +69,8 @@
else
{
_reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
- _memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
+ _memory_group.manage(&_reduced_outs[i]);
+ _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
}
}
@@ -92,13 +93,15 @@
out_shape.remove_dimension(axis_local[i] - i);
}
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
- _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+ _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
}
}
Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
TensorShape out_shape = input->tensor_shape();
@@ -140,7 +143,7 @@
void CLReduceMean::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
@@ -151,6 +154,5 @@
{
_reshape.run();
}
- _memory_group.release();
}
} // namespace arm_compute
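
When keep_dims is false, CLReduceMean removes each reduced axis from the output shape, shifting the index by the number of axes already removed (the axis_local[i] - i seen above). A standalone sketch of that shape arithmetic, assuming the reduction axes are sorted in ascending order, is:

    #include <cassert>
    #include <vector>

    int main()
    {
        std::vector<int> shape = { 2, 3, 4, 5 }; // input shape
        std::vector<int> axes  = { 1, 3 };       // sorted reduction axes

        for(std::size_t i = 0; i < axes.size(); ++i)
        {
            // Each earlier removal shifts the remaining axes down by one.
            shape.erase(shape.begin() + (axes[i] - static_cast<int>(i)));
        }

        assert((shape == std::vector<int>{ 2, 4 }));
        return 0;
    }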
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 3d82e3f..9f99d2d 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -71,7 +71,7 @@
else
{
// Create temporary tensor infos
- auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+ std::vector<TensorInfo> sums_vector(num_of_stages - 1);
// Create intermediate tensor info
TensorShape shape{ input->tensor_shape() };
@@ -110,17 +110,17 @@
}
// Validate ReductionOperation only on first kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, first_kernel_op));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, &sums_vector[0], axis, first_kernel_op));
// Validate ReductionOperation on intermediate stages
for(unsigned int i = 1; i < num_of_stages - 1; ++i)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[i - 1], &sums_vector[i], axis, intermediate_kernel_op));
}
// Validate ReductionOperation on the last stage
const unsigned int last_stage = num_of_stages - 1;
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output, axis, last_kernel_op, input->dimension(0)));
}
return Status{};
@@ -133,7 +133,7 @@
_is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
// Configure reduction operation kernels
- _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
+ _reduction_kernels_vector.resize(_num_of_stages);
// Create temporary tensors
if(_is_serial)
@@ -142,8 +142,8 @@
}
else
{
- _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
- _results_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ _border_handlers_vector.resize(_num_of_stages);
+ _results_vector.resize(_num_of_stages - 1);
TensorShape shape{ input->info()->tensor_shape() };
for(unsigned int i = 0; i < _num_of_stages - 1; i++)
{
@@ -152,7 +152,7 @@
}
// Apply ReductionOperation only on first kernel
- _memory_group.manage(_results_vector.get());
+ _memory_group.manage(&_results_vector[0]);
ReductionOperation first_kernel_op;
ReductionOperation intermediate_kernel_op;
@@ -183,30 +183,30 @@
ARM_COMPUTE_ERROR("Not supported");
}
- _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+ _reduction_kernels_vector[0].configure(input, &_results_vector[0], axis, first_kernel_op);
_border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
// Apply ReductionOperation on intermediate stages
for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
{
- _memory_group.manage(_results_vector.get() + i);
- _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
- _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+ _memory_group.manage(&_results_vector[i]);
+ _reduction_kernels_vector[i].configure(&_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
+ _border_handlers_vector[i].configure(&_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
_results_vector[i - 1].allocator()->allocate();
}
// Apply ReductionOperation on the last stage
const unsigned int last_stage = _num_of_stages - 1;
const unsigned int input_width = input->info()->dimension(0);
- _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
- _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+ _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output, axis, last_kernel_op, input_width);
+ _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
_results_vector[last_stage - 1].allocator()->allocate();
}
}
void CLReductionOperation::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_is_serial)
{
@@ -220,6 +220,4 @@
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
}
-
- _memory_group.release();
}
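
CLReductionOperation, CLLaplacianPyramid, CLOpticalFlow, CLSplit, CLStackLayer and several other functions in this patch replace make_unique<T[]> arrays with std::vector<T>. A standalone sketch of why the vector form is preferred (the size travels with the storage, so emptiness checks and element addressing become trivial) is:

    #include <cassert>
    #include <vector>

    struct Kernel
    {
        bool configured{ false };
        void configure() { configured = true; }
    };

    int main()
    {
        std::vector<Kernel> kernels; // was: std::unique_ptr<Kernel[]> plus a separate count
        assert(kernels.empty());     // "Unconfigured function" check, as in run()

        kernels.resize(3);           // was: make_unique<Kernel[]>(3)
        for(std::size_t i = 0; i < kernels.size(); ++i)
        {
            Kernel *k = &kernels[i]; // was: kernels.get() + i
            k->configure();
        }

        assert(!kernels.empty());
        assert(kernels[2].configured);
        return 0;
    }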
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
index d4bc855..22fbef1 100644
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,10 +81,8 @@
{
CLScheduler::get().enqueue(_border_handler, false);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
CLScheduler::get().enqueue(_sobel_hor, false);
CLScheduler::get().enqueue(_sobel_vert);
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
index 6083090..9b38f69 100644
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,10 +81,8 @@
{
CLScheduler::get().enqueue(_border_handler, false);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
CLScheduler::get().enqueue(_sobel_hor, false);
CLScheduler::get().enqueue(_sobel_vert);
-
- _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index d671846..7e41dba 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -191,7 +191,7 @@
void CLSoftmaxLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_needs_flattening)
{
@@ -205,9 +205,6 @@
{
CLScheduler::get().enqueue(_reshape_kernel, true);
}
-
- // Relase intermediate buffers
- _memory_group.release();
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index f084351..8d37d53 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,8 +42,8 @@
void CLSplit::configure(const ICLTensor *input, const std::vector<ICLTensor *> &outputs, unsigned int axis)
{
// Create Slice functions
- _num_outputs = outputs.size();
- _slice_functions = arm_compute::support::cpp14::make_unique<CLSlice[]>(_num_outputs);
+ _num_outputs = outputs.size();
+ _slice_functions.resize(_num_outputs);
// Get output shape
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 71327fe..2700b49 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -46,8 +46,8 @@
void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
{
- _num_inputs = input.size();
- _stack_kernels = arm_compute::support::cpp14::make_unique<CLStackLayerKernel[]>(_num_inputs);
+ _num_inputs = input.size();
+ _stack_kernels.resize(_num_inputs);
// Wrap around negative values
const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index 428d091..eb1dd8c 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,7 +74,7 @@
// Wrap around negative values
const unsigned int axis_u = wrap_axis(axis, input->info());
_num_slices = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
- _strided_slice_vector = arm_compute::support::cpp14::make_unique<CLStridedSlice[]>(_num_slices);
+ _strided_slice_vector.resize(_num_slices);
Coordinates slice_start;
int32_t slice_end_mask;
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index d0801a6..a8667c3 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -51,7 +51,7 @@
// Output auto inizialitation if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
switch(num_inputs)
@@ -90,7 +90,7 @@
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -109,7 +109,7 @@
break;
default:
// Configure generic case WidthConcatenate kernels
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+ _concat_kernels_vector.resize(_num_inputs);
unsigned int width_offset = 0;
for(unsigned int i = 0; i < _num_inputs; ++i)
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 069196e..d3c3f98 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,6 +62,11 @@
output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
kernel_dims.height == 1 ? 1U : 4U);
}
+ else if(kernel_max_dim == 7U)
+ {
+ output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
+ kernel_dims.height == 1 ? 1U : 2U);
+ }
return output_tile;
}
@@ -73,7 +78,8 @@
std::vector<WinogradConfiguration> fast_math_winograd =
{
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
};
auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
@@ -198,7 +204,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run input transform
_input_transform.run();
@@ -208,8 +214,6 @@
// Run output transform
CLScheduler::get().enqueue(_output_transform);
-
- _memory_group.release();
}
void CLWinogradConvolutionLayer::prepare()