arm_compute v19.05
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index d33e134..6863bb0 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -57,15 +57,13 @@
void NEArgMinMaxLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_run_fill_border)
{
NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
}
NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
-
- _memory_group.release();
}
} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
new file mode 100644
index 0000000..a4db1fd
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>();
+ k->configure(input, block_shape, output);
+ _kernel = std::move(k);
+}
+
+void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>();
+ k->configure(input, block_shape_x, block_shape_y, output);
+ _kernel = std::move(k);
+}
+
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+{
+ return NEBatchToSpaceLayerKernel::validate(input, block_shape, output);
+}
+
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output)
+{
+ return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 0e5d50f..032e617 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -162,7 +162,7 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run sobelNxN
_sobel->run();
@@ -184,6 +184,4 @@
// Run edge tracing
NEScheduler::get().schedule(&_edge_trace, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index 21ab47d..71af560 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,9 @@
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
@@ -35,56 +38,111 @@
namespace arm_compute
{
NEConcatenateLayer::NEConcatenateLayer()
- : _concat_function(nullptr)
+ : _concat_kernels(),
+ _num_inputs(0),
+ _axis(Window::DimX)
{
}
-void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, DataLayoutDimension axis)
+void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, size_t axis)
{
ARM_COMPUTE_ERROR_ON(output == nullptr);
+ _axis = axis;
+ _num_inputs = inputs_vector.size();
- switch(get_data_layout_dimension_index(output->info()->data_layout(), axis))
+ std::vector<ITensorInfo *> inputs_vector_info;
+ inputs_vector_info.reserve(_num_inputs);
+ for(unsigned int i = 0; i < _num_inputs; ++i)
{
- case 0:
+ ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
+ inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+ }
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
+
+ // Output auto inizialitation if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(NEConcatenateLayer::validate(inputs_vector_info, output->info(), axis));
+
+ unsigned int offset = 0;
+
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ switch(_axis)
{
- auto func = support::cpp14::make_unique<NEWidthConcatenateLayer>();
- func->configure(inputs_vector, output);
- _concat_function = std::move(func);
- break;
+ case Window::DimX:
+ {
+ auto kernel = support::cpp14::make_unique<NEWidthConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case Window::DimY:
+ {
+ auto kernel = support::cpp14::make_unique<NEHeightConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case Window::DimZ:
+ {
+ auto kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
}
- case 2:
- {
- auto func = support::cpp14::make_unique<NEDepthConcatenateLayer>();
- func->configure(inputs_vector, output);
- _concat_function = std::move(func);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+ offset += inputs_vector.at(i)->info()->dimension(_axis);
}
}
-Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
{
- ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
- switch(get_data_layout_dimension_index(output->data_layout(), axis))
+ unsigned int offset = 0;
+ for(const auto &input : inputs_vector)
{
- case 0:
- ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, output));
- break;
- case 2:
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayer::validate(inputs_vector, output));
- break;
- default:
- ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ switch(axis)
+ {
+ case Window::DimX:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayerKernel::validate(input, offset, output));
+ break;
+ }
+ case Window::DimY:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEHeightConcatenateLayerKernel::validate(input, offset, output));
+ break;
+ }
+ case Window::DimZ:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, offset, output));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+ offset += input->dimension(axis);
}
+
+ if(output->total_size() != 0)
+ {
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+
return Status{};
}
void NEConcatenateLayer::run()
{
- ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
- _concat_function->run();
+ for(auto &kernel : _concat_kernels)
+ {
+ NEScheduler::get().schedule(kernel.get(), _axis);
+ }
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index b84dfd3..973855e 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -103,12 +103,10 @@
if(_is_separable)
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
-
- _memory_group.release();
}
else
{
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 5059162..a62459b 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -73,6 +73,13 @@
_function = std::move(f);
break;
}
+ case ConvolutionMethod::FFT:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<NEFFTConvolutionLayer>(_memory_manager);
+ f->configure(input, weights, biases, output, conv_info, act_info);
+ _function = std::move(f);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
@@ -97,6 +104,10 @@
case ConvolutionMethod::DIRECT:
//Validate Gemm-based Convolution
ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
+ case ConvolutionMethod::FFT:
+ // Validate FFT-based convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+ break;
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
@@ -148,12 +159,22 @@
return (*found).second;
}
- if(dilation != Size2D(1U, 1U) || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
+ if(dilation != Size2D(1U, 1U))
{
return ConvolutionMethod::GEMM;
}
-
- return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+ else
+ {
+ if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+ {
+ return ConvolutionMethod::FFT;
+ }
+ if(input->dimension(idx_c) < 16)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+ }
}
void NEConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
new file mode 100644
index 0000000..cc39d02
--- /dev/null
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "arm_compute/runtime/NEON/functions/NECropResize.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+NECropResize::NECropResize()
+ : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results()
+{
+}
+
+Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
+ Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
+ TensorInfo temp_info;
+ ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value));
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ TensorShape out_shape(input->tensor_shape()[0], crop_size.x, crop_size.y, boxes->tensor_shape()[1]);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), out_shape);
+ }
+ return Status{};
+}
+
+void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
+ InterpolationPolicy method, float extrapolation_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+
+ _num_boxes = boxes->info()->tensor_shape()[1];
+ TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
+
+ _output = output;
+ _method = method;
+ _extrapolation_value = extrapolation_value;
+
+ // For each crop box:
+ // - A crop kernel is used to extract the initial cropped image as specified by boxes[i] from the 3D image input[box_ind[i]].
+ // - A tensor is required to hold this initial cropped image.
+ // - A scale function is used to resize the cropped image to the size specified by crop_size.
+ // - A tensor is required to hold the final scaled image before it is copied into the 4D output
+ // that will hold all final cropped and scaled 3D images.
+ _crop.reserve(_num_boxes);
+ _crop_results.reserve(_num_boxes);
+ _scaled_results.reserve(_num_boxes);
+ _scale.reserve(_num_boxes);
+
+ for(unsigned int i = 0; i < _num_boxes; ++i)
+ {
+ auto crop_tensor = support::cpp14::make_unique<Tensor>();
+ TensorInfo crop_result_info(1, DataType::F32);
+ crop_result_info.set_data_layout(DataLayout::NHWC);
+ crop_tensor->allocator()->init(crop_result_info);
+
+ auto scale_tensor = support::cpp14::make_unique<Tensor>();
+ TensorInfo scaled_result_info(out_shape, 1, DataType::F32);
+ scaled_result_info.set_data_layout(DataLayout::NHWC);
+ scale_tensor->allocator()->init(scaled_result_info);
+
+ auto crop_kernel = support::cpp14::make_unique<NECropKernel>();
+ auto scale_kernel = support::cpp14::make_unique<NEScale>();
+ crop_kernel->configure(input, boxes, box_ind, crop_tensor.get(), i, _extrapolation_value);
+
+ _crop.emplace_back(std::move(crop_kernel));
+ _scaled_results.emplace_back(std::move(scale_tensor));
+ _crop_results.emplace_back(std::move(crop_tensor));
+ _scale.emplace_back(std::move(scale_kernel));
+ }
+}
+
+void NECropResize::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
+
+ for(unsigned int i = 0; i < _num_boxes; ++i)
+ {
+ // Size of the crop box in _boxes and thus the shape of _crop_results[i]
+ // may not be known until run-time and so the kernels cannot be configured until then.
+ _crop[i]->configure_output_shape();
+ _crop_results[i]->allocator()->allocate();
+ NEScheduler::get().schedule(_crop[i].get(), Window::DimZ);
+
+ // Scale the cropped image.
+ _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false);
+ _scaled_results[i]->allocator()->allocate();
+ _scale[i]->run();
+
+ // Copy scaled image into output.
+ std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i)));
+ }
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 44d7197..aff335e 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -51,8 +51,8 @@
unsigned int inner_border_right, unsigned int inner_border_top)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
@@ -68,7 +68,11 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- if(bias != nullptr)
+ if(is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
}
@@ -111,10 +115,11 @@
_inner_border = std::make_pair(inner_border_right, inner_border_top);
_is_prepared = false;
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
- _weights_flipped.allocator()->init(TensorInfo(weights->info()->tensor_shape(), 1, weights->info()->data_type()));
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
_flip_weights.configure(weights, &_weights_flipped);
auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
@@ -159,12 +164,10 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
_upsample_f.run();
_conv_f.run();
-
- _memory_group.release();
}
void NEDeconvolutionLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
index 49db855..8f070a2 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,26 +45,30 @@
void NEDepthConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output) // NOLINT
{
- _num_inputs = inputs_vector.size();
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEDepthConcatenateLayerKernel[]>(_num_inputs);
- _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+ _num_inputs = inputs_vector.size();
std::vector<ITensorInfo *> inputs_vector_info;
for(unsigned int i = 0; i < _num_inputs; i++)
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
ARM_COMPUTE_ERROR_THROW_ON(NEDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
unsigned int depth_offset = 0;
+ _concat_kernels_vector.reserve(_num_inputs);
+ _border_handlers_vector.reserve(_num_inputs);
for(unsigned int i = 0; i < _num_inputs; ++i)
{
- _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+ auto concat_kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>();
+ auto border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+ concat_kernel->configure(inputs_vector.at(i), depth_offset, output);
+ border_kernel->configure(inputs_vector.at(i), concat_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+ _border_handlers_vector.emplace_back(std::move(border_kernel));
+ _concat_kernels_vector.emplace_back(std::move(concat_kernel));
depth_offset += inputs_vector.at(i)->info()->dimension(2);
}
@@ -80,7 +84,7 @@
// Output auto inizialitation if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
unsigned int depth_offset = 0;
@@ -98,7 +102,7 @@
{
for(unsigned i = 0; i < _num_inputs; ++i)
{
- NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX);
- NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
+ NEScheduler::get().schedule(_border_handlers_vector[i].get(), Window::DimX);
+ NEScheduler::get().schedule(_concat_kernels_vector[i].get(), Window::DimX);
}
}
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index f0fd4cf..3bb69b1 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -31,112 +31,79 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
-NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
- : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(),
- _permuted_weights(), _permuted_output(), _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false),
- _is_activationlayer_enabled(false)
+namespace arm_compute
+{
+NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
+ _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
+ _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_UNUSED(act_info);
PixelValue zero_value(0.f);
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _has_bias = biases != nullptr;
- _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
- conv_info,
- input->info()->data_type(),
- depth_multiplier,
- input->info()->data_layout());
- _are_weights_reshaped = false;
- _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
- _permute = _is_optimized == _is_nchw;
-
// Initialize the intermediate accumulator tensor in case of quantized input
if(_is_quantized)
{
TensorShape accum_shape = output->info()->tensor_shape();
DataLayout accum_layout = output->info()->data_layout();
- if(!_is_optimized && !_is_nchw)
+ if(!_is_nchw)
{
permute(accum_shape, PermutationVector(1U, 2U, 0U));
accum_layout = DataLayout::NCHW;
}
+ _memory_group.manage(&_accumulator);
_accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
_accumulator.info()->set_data_layout(accum_layout);
zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
}
- if(_is_optimized)
+ if(!_is_nchw)
{
- ITensor *optimized_output = (_is_quantized) ? &_accumulator : output;
- if(_is_nchw)
- {
- // Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
- _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
- // Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
- _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
- // Configure optimized depthwise
- _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
- // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permuted_output.info()->set_data_layout(DataLayout::NHWC);
- _permute_output.configure(&_permuted_output, optimized_output, PermutationVector(1U, 2U, 0U));
+ // Configure depthwise
+ _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
- // Allocate tensors
- _permuted_input.allocator()->allocate();
- _permuted_weights.allocator()->allocate();
- _permuted_output.allocator()->allocate();
- }
- else
- {
- _dwc_kernel.configure(input, weights, optimized_output, conv_info, depth_multiplier, DataLayout::NHWC);
- }
+ // Configure border handler
+ _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
}
else
{
- if(!_is_nchw)
- {
- // Configure the function to transform the input tensor from NHWC -> NCHW
- _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
- _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+ // Configure depthwise convolution kernel
+ _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
- // Configure the function to transform the weights tensor from HWI -> IHW
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
- _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-
- // Configure optimized depthwise
- _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier);
-
- // Configure border handler
- _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-
- // Allocate tensors
- _permuted_input.allocator()->allocate();
- _permuted_weights.allocator()->allocate();
- }
- else
- {
- // Configure depthwise convolution kernel
- _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
-
- // Configure border handler
- _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
- }
+ // Configure border handler
+ _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
}
// Configure biases accumulation
@@ -145,37 +112,138 @@
const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
+ int output_multiplier;
+ int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_accumulator, biases, (_is_nchw || _is_optimized) ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+ _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
_accumulator.allocator()->allocate();
}
else if(_has_bias)
{
- _output_stage_kernel.configure((_is_nchw || _is_optimized) ? output : &_permuted_output, biases);
+ _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
}
- if(!_is_optimized && !_is_nchw)
+ // Permute output
+ if(!_is_nchw)
{
// Configure the function to transform the convoluted output to NHWC
_permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
_permuted_output.allocator()->allocate();
}
+}
- //Configure Activation Layer
+void NEDepthwiseConvolutionLayer3x3::configure_optimized(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info)
+{
+ ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+ const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
+ const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
+ _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
+ if(!_is_activationlayer_enabled)
+ {
+ act_info_to_use = act_info;
+ }
+
+ if(_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+ _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+ // Configure optimized depthwise
+ _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use);
+
+ // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+ _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+ else
+ {
+ _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use);
+ }
+}
+
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+ // idx_w and idx_h only used for validation
+ const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_UNUSED(idx_w);
+ ARM_COMPUTE_UNUSED(idx_h);
+
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _has_bias = biases != nullptr;
+ _is_optimized = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
+ weights->info(),
+ conv_info,
+ depth_multiplier, dilation);
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _permute = _is_optimized == _is_nchw;
+ _is_prepared = false;
_is_activationlayer_enabled = act_info.enabled();
+ // Configure appropriate pipeline
+ if(_is_optimized)
+ {
+ configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ }
+ else
+ {
+ configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ }
+
+ // Configure activation
if(_is_activationlayer_enabled)
{
_activationlayer_function.configure(output, nullptr, act_info);
}
}
-Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
if(biases != nullptr)
{
@@ -184,14 +252,20 @@
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
}
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
-
- if(is_quantized)
+ if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier));
}
//Validate Activation Layer
@@ -203,43 +277,14 @@
return Status{};
}
-void NEDepthwiseConvolutionLayer3x3::run()
+void NEDepthwiseConvolutionLayer3x3::run_generic()
{
- if(_is_first_run && _is_optimized)
- {
- _is_first_run = false;
- // Create convolver (deferred)
- _dwc_kernel.generate_convolver();
- }
-
- // Permute weights
- if(_permute)
- {
- if(!_are_weights_reshaped)
- {
- _are_weights_reshaped = true;
- _permute_weights.run();
- }
-
- _permute_input.run();
- }
-
- // Handle input
- if(!_is_optimized)
- {
- // Fill border
- NEScheduler::get().schedule(&_border_handler, Window::DimX);
- }
+ // Fill border
+ NEScheduler::get().schedule(&_border_handler, Window::DimX);
// Execute depthwise convolution
NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
- // Permute output
- if(_is_optimized && _is_nchw)
- {
- _permute_output.run();
- }
-
// Add biases
if(_has_bias || _is_quantized)
{
@@ -247,17 +292,71 @@
}
// Permute output
- if(!_is_optimized && !_is_nchw)
+ if(!_is_nchw)
{
_permute_output.run();
}
+}
+void NEDepthwiseConvolutionLayer3x3::run_optimized()
+{
+ // Run assembly function
+ _dwc_optimized_func.run();
+
+ // Permute output
+ if(_is_nchw)
+ {
+ _permute_output.run();
+ }
+}
+
+void NEDepthwiseConvolutionLayer3x3::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Permute input
+ if(_permute)
+ {
+ _permute_input.run();
+ }
+
+ _is_optimized ? run_optimized() : run_generic();
+
+ // Run activation
if(_is_activationlayer_enabled)
{
_activationlayer_function.run();
}
}
+void NEDepthwiseConvolutionLayer3x3::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Permute weights
+ if(_permute)
+ {
+ _permuted_weights.allocator()->allocate();
+ _permute_weights.run();
+ _original_weights->mark_as_unused();
+ }
+
+ // Prepare optimized function
+ if(_is_optimized)
+ {
+ _dwc_optimized_func.prepare();
+ if(!_permuted_weights.is_used())
+ {
+ _permuted_weights.allocator()->free();
+ }
+ }
+
+ _is_prepared = true;
+ }
+}
+
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
_permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(),
@@ -266,14 +365,21 @@
}
void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_UNUSED(channel_idx);
-
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
+ // idx_w and idx_h only used for validation
+ const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_UNUSED(idx_w);
+ ARM_COMPUTE_UNUSED(idx_h);
+
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
_is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
@@ -304,7 +410,7 @@
bool append_bias = (biases != nullptr) && !_is_quantized;
// Calculate output shape
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -332,7 +438,7 @@
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
_input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
- _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+ _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -356,7 +462,8 @@
const QuantizationInfo output_quant_info = output->info()->quantization_info();
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
+ int output_multiplier;
+ int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
_output_stage_kernel.configure(&_output_reshaped, biases, output_to_use, output_multiplier, output_shift, output_quant_info.offset);
_output_reshaped.allocator()->allocate();
@@ -399,14 +506,17 @@
}
Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) + (weights->dimension(width_idx) - 1) * (dilation.x() - 1) > input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) + (weights->dimension(height_idx) - 1) * (dilation.y() - 1) > input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom());
// Clone output to use auto init
auto output_clone = output->clone();
@@ -433,7 +543,7 @@
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const bool append_bias = (biases != nullptr) && !is_quantized;
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
const size_t weights_w = weights_to_use->dimension(0);
const size_t weights_h = weights_to_use->dimension(1);
const size_t weights_z = weights_to_use->dimension(2);
@@ -460,7 +570,7 @@
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -542,3 +652,4 @@
_is_prepared = true;
}
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 0627977..e92b4bf 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,34 +24,20 @@
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
+#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
-NEDequantizationLayer::NEDequantizationLayer()
- : _dequantize_kernel()
+namespace arm_compute
{
+void NEDequantizationLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEDequantizationLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
}
-Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
- ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(input, output, min_max));
-
- return Status{};
+ return NEDequantizationLayerKernel::validate(input, output);
}
-
-void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
-
- // Configure kernel
- _dequantize_kernel.configure(input, output, min_max);
-}
-
-void NEDequantizationLayer::run()
-{
- NEScheduler::get().schedule(&_dequantize_kernel, Window::DimY);
-}
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 40e40c8..322bb2c 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -105,7 +105,7 @@
{
NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_conv_kernel, _dim_split);
if(_has_bias)
@@ -117,5 +117,4 @@
{
_activationlayer_function.run();
}
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
new file mode 100644
index 0000000..25ba1c8
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
+{
+}
+
+void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEFFT1D::validate(input->info(), output->info(), config));
+
+ // Decompose size to radix factors
+ const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
+ const unsigned int N = input->info()->tensor_shape()[config.axis];
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+ ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+ // Flags
+ _run_scale = config.direction == FFTDirection::Inverse;
+
+ const bool is_c2r = input->info()->num_channels() == 2 && output->info()->num_channels() == 1;
+
+ // Configure digit reverse
+ FFTDigitReverseKernelInfo digit_reverse_config;
+ digit_reverse_config.axis = config.axis;
+ digit_reverse_config.conjugate = config.direction == FFTDirection::Inverse;
+ TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+ _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+ _memory_group.manage(&_digit_reversed_input);
+ _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+
+ // Create and configure FFT kernels
+ unsigned int Nx = 1;
+ _num_ffts = decomposed_vector.size();
+ _fft_kernels.resize(_num_ffts);
+ _axis = config.axis;
+
+ for(unsigned int i = 0; i < _num_ffts; ++i)
+ {
+ const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+ FFTRadixStageKernelInfo fft_kernel_info;
+ fft_kernel_info.axis = config.axis;
+ fft_kernel_info.radix = radix_for_stage;
+ fft_kernel_info.Nx = Nx;
+ fft_kernel_info.is_first_stage = (i == 0);
+ _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+
+ Nx *= radix_for_stage;
+ }
+
+ // Configure scale kernel
+ if(_run_scale)
+ {
+ FFTScaleKernelInfo scale_config;
+ scale_config.scale = static_cast<float>(N);
+ scale_config.conjugate = config.direction == FFTDirection::Inverse;
+ is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+ }
+
+ // Allocate tensors
+ _digit_reversed_input.allocator()->allocate();
+ _digit_reverse_indices.allocator()->allocate();
+
+ // Init digit reverse indices
+ const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+ std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
+}
+
+Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+
+ // Check if FFT is decomposable
+ const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
+ const unsigned int N = input->tensor_shape()[config.axis];
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+ ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ // All combinations are supported except real input with real output (i.e., both input channels set to 1)
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+void NEFFT1D::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ NEScheduler::get().schedule(&_digit_reverse_kernel, (_axis == 0 ? Window::DimY : Window::DimZ));
+
+ for(unsigned int i = 0; i < _num_ffts; ++i)
+ {
+ NEScheduler::get().schedule(&_fft_kernels[i], (_axis == 0 ? Window::DimY : Window::DimX));
+ }
+
+ // Run output scaling
+ if(_run_scale)
+ {
+ NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
+ }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
new file mode 100644
index 0000000..9210ecf
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+namespace arm_compute
+{
+NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+{
+}
+
+void NEFFT2D::configure(const ITensor *input, ITensor *output, const FFT2DInfo &config)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEFFT2D::validate(input->info(), output->info(), config));
+
+ // Setup first pass
+ FFT1DInfo first_pass_config;
+ first_pass_config.axis = config.axes.first;
+ first_pass_config.direction = config.direction;
+ _memory_group.manage(&_first_pass_tensor);
+ _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);
+
+ // Setup second pass
+ FFT1DInfo second_pass_config;
+ second_pass_config.axis = config.axes.second;
+ second_pass_config.direction = config.direction;
+ _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config);
+ _first_pass_tensor.allocator()->allocate();
+}
+
+Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ // Create intermediate tensor info
+ TensorInfo first_pass_tensor(input->clone()->set_is_resizable(true).reset_padding().set_num_channels(2));
+
+ // Validate first pass
+ FFT1DInfo first_pass_config;
+ first_pass_config.axis = config.axes.first;
+ first_pass_config.direction = config.direction;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(input, &first_pass_tensor, first_pass_config));
+
+ // Validate second pass
+ FFT1DInfo second_pass_config;
+ second_pass_config.axis = config.axes.second;
+ second_pass_config.direction = config.direction;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config));
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+void NEFFT2D::run()
+{
+ _memory_group.acquire();
+
+ _first_pass_func.run();
+ _second_pass_func.run();
+
+ _memory_group.release();
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
new file mode 100644
index 0000000..0823007
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+int pad_decomposable(int N)
+{
+ const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
+
+ int pad = 0;
+ bool is_decomposed = false;
+ while(!is_decomposed)
+ {
+ const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
+ is_decomposed = !decomposed_vector.empty();
+ if(!is_decomposed)
+ {
+ ++pad;
+ }
+ }
+ return pad;
+}
+} // namespace
+
+NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager),
+ _flip_weights_func(),
+ _permute_input_func(),
+ _permute_output_func(),
+ _permute_weights_func(),
+ _permute_bias_func(),
+ _pad_input_func(),
+ _pad_weights_func(),
+ _transform_input_func(memory_manager),
+ _transform_weights_func(),
+ _itransform_output_func(memory_manager),
+ _prod_func(),
+ _reduce_func(),
+ _extract_output_func(),
+ _bias_add_func(),
+ _activation_layer_func(),
+ _permuted_input(),
+ _permuted_weights(),
+ _permuted_bias(),
+ _permuted_output(),
+ _padded_input(),
+ _padded_weights(),
+ _flip_axis(),
+ _flipped_weights(),
+ _transformed_input(),
+ _transformed_weights(),
+ _input_weights_product(),
+ _output_product(),
+ _output_reduced(),
+ _itransformed_output(),
+ _reshaped_output(),
+ _bias_output(),
+ _original_weights(nullptr),
+ _original_bias(nullptr),
+ _is_activationlayer_enabled(false),
+ _needs_permute(false),
+ _has_bias(false),
+ _is_prepared(false)
+{
+}
+
+void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ _original_weights = weights;
+ _original_bias = biases;
+
+ // Flat if bias addition is required
+ _has_bias = biases != nullptr;
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+ pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+ // Tensors to use
+ ITensor *input_to_use = input;
+ const ITensor *weights_to_use = weights;
+ ITensor *output_to_use = _has_bias ? &_bias_output : output;
+
+ // Permute bias
+ if(biases != nullptr)
+ {
+ _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
+ _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
+ }
+
+ // Permute input if needed
+ _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
+ if(_needs_permute)
+ {
+ _memory_group.manage(&_permuted_input);
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+ input_to_use = &_permuted_input;
+ weights_to_use = &_permuted_weights;
+ }
+
+ // Flip weights
+ _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
+
+ // Pad weights
+ const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+ _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
+
+ // Transform weights
+ _transform_weights_func = support::cpp14::make_unique<NEFFT2D>();
+ _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
+
+ // Pad input
+ const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+ _memory_group.manage(&_padded_input);
+ _pad_input_func.configure(input_to_use, &_padded_input, padding_in);
+ if(_needs_permute)
+ {
+ _permuted_input.allocator()->allocate();
+ }
+
+ // Transform input
+ _memory_group.manage(&_transformed_input);
+ _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo());
+ _padded_input.allocator()->allocate();
+
+ // Perform product
+ _memory_group.manage(&_output_product);
+ _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product);
+ _transformed_input.allocator()->allocate();
+
+ // Perform reduction
+ _memory_group.manage(&_output_reduced);
+ _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM);
+ _output_product.allocator()->allocate();
+
+ // Transform output
+ _memory_group.manage(&_itransformed_output);
+ FFT2DInfo itranform_info;
+ itranform_info.direction = FFTDirection::Inverse;
+ _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+ _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info);
+ _output_reduced.allocator()->allocate();
+
+ // Reshape output
+ TensorShape reshaped_shape = _itransformed_output.info()->tensor_shape();
+ reshaped_shape.remove_dimension(2);
+ _reshaped_output.allocator()->init(_itransformed_output.info()->clone()->set_tensor_shape(reshaped_shape));
+
+ // Extract correct region
+ const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
+ const int start_top = kernel_size.y() - conv_info.pad_top() - 1;
+ const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+ const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+ if(_has_bias)
+ {
+ _memory_group.manage(&_bias_output);
+ }
+ else if(_needs_permute)
+ {
+ output_to_use = &_permuted_output;
+ _memory_group.manage(&_permuted_output);
+ }
+ _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+ _reshaped_output.allocator()->allocate();
+ _itransformed_output.allocator()->allocate();
+
+ // Add bias
+ if(biases != nullptr)
+ {
+ output_to_use = output;
+ if(_needs_permute)
+ {
+ output_to_use = &_permuted_output;
+ _memory_group.manage(&_permuted_output);
+ }
+ auto_init_if_empty(*output_to_use->info(), *_bias_output.info());
+ _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP);
+ _bias_output.allocator()->allocate();
+ }
+
+ // Permute output
+ if(_needs_permute)
+ {
+ // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ // Allocate tensors
+ _permuted_output.allocator()->allocate();
+ }
+
+ // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activation_layer_func.configure(output, nullptr, act_info);
+ }
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ axis_data[0] = 0;
+ axis_data[1] = 1;
+}
+
+Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+
+ // Strides
+ const auto strides = conv_info.stride();
+ ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+
+ // Validate biases
+ if(biases != nullptr)
+ {
+ const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channels] != biases->tensor_shape().x());
+ }
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+ }
+ }
+
+ return Status{};
+}
+
+void NEFFTConvolutionLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Transform input
+ if(_needs_permute)
+ {
+ _permute_input_func.run();
+ }
+ _pad_input_func.run();
+ _transform_input_func.run();
+
+ // Perform operations to frequency domain
+ _prod_func.run();
+
+ _reduce_func.run();
+
+ // Transform output
+ _itransform_output_func.run();
+ _reshaped_output.allocator()->import_memory(_itransformed_output.buffer());
+ _extract_output_func.run();
+
+ // Add bias
+ if(_has_bias)
+ {
+ _bias_add_func.run();
+ }
+ if(_needs_permute)
+ {
+ _permute_output_func.run();
+ }
+
+ // Run activation layer
+ if(_is_activationlayer_enabled)
+ {
+ _activation_layer_func.run();
+ }
+}
+
+void NEFFTConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Permute bias to NCHW
+ if(_original_bias != nullptr)
+ {
+ _permuted_bias.allocator()->allocate();
+ _permute_bias_func.run();
+ _original_bias->mark_as_unused();
+ }
+
+ const ITensor *cur_weights = _original_weights;
+
+ // Permute weights
+ if(_needs_permute)
+ {
+ ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
+
+ _permuted_weights.allocator()->allocate();
+ _permute_weights_func.run();
+ cur_weights->mark_as_unused();
+ cur_weights = &_permuted_weights;
+ }
+
+ // Flip weights
+ _flipped_weights.allocator()->allocate();
+ _flip_weights_func.run();
+ cur_weights->mark_as_unused();
+
+ // Pad weights
+ _padded_weights.allocator()->allocate();
+ _pad_weights_func.run();
+ _flipped_weights.mark_as_unused();
+ _flipped_weights.allocator()->free();
+
+ // Transform weights to frequency domain
+ _transformed_weights.allocator()->allocate();
+ _transform_weights_func->run();
+ _transform_weights_func.reset();
+
+ _padded_weights.mark_as_unused();
+ _padded_weights.allocator()->free();
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 4137b1d..af35301 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -93,7 +93,7 @@
{
NEScheduler::get().schedule(&_border_handler, Window::DimZ);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
@@ -103,6 +103,4 @@
}
NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 45e21b5..e1a17db 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -228,7 +228,8 @@
if(_is_quantized)
{
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
- int output_multiplier, output_shift;
+ int output_multiplier;
+ int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
_gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
_gemmlowp_output.allocator()->allocate();
@@ -333,7 +334,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
@@ -363,8 +364,6 @@
NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
}
}
-
- _memory_group.release();
}
void NEFullyConnectedLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 914f088..55bcc45 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -238,16 +238,14 @@
{
prepare();
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
if(_asm_glue.is_configured())
{
- _memory_group.acquire();
_asm_glue.run();
- _memory_group.release();
}
else
{
- _memory_group.acquire();
-
if(!_run_vector_matrix_multiplication)
{
// Run interleave kernel
@@ -262,8 +260,6 @@
NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
- _memory_group.release();
-
// Run matrix addition kernel
if(_run_addition)
{
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 470e922..55e067f 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -35,7 +35,7 @@
{
namespace
{
-std::unique_ptr<IFunction> create_function_all_types(arm_gemm::KernelDescription gemm_kernel_info,
+std::unique_ptr<IFunction> create_function_all_types(const arm_gemm::KernelDescription &gemm_kernel_info,
const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
std::shared_ptr<IMemoryManager> memory_manager)
@@ -375,7 +375,7 @@
void NEGEMMAssemblyDispatch::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_function != nullptr)
{
_function->run();
@@ -385,6 +385,5 @@
ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
_arm_gemm->run();
}
- _memory_group.release();
}
} //namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index be7cc2d..a2c4e8a 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,16 +90,17 @@
}
NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
- _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
- _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
+ : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
+ _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false),
+ _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
-void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, int gemm_3d_depth)
+void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(), act_info, gemm_3d_depth,
+ _skip_im2col));
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
@@ -114,7 +115,41 @@
input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
- _mm_gemmlowp.configure(input, weights, nullptr, output, gemm_info);
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input_quantization_info : output->info()->quantization_info();
+
+ float multiplier = input_quantization_info.scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier;
+ int output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+ // Merge activation with output stage
+ int min_activation = 0;
+ int max_activation = 0;
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+ if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+ max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+ _is_activationlayer_enabled = false;
+ }
+
+ GEMMLowpOutputStageInfo output_info;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = output_quant_info.offset;
+ output_info.gemmlowp_multiplier = output_multiplier;
+ output_info.gemmlowp_shift = output_shift;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+
+ _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
// Revert back QuantizatioInfo as input and weights could be used in other convolution layers
input->info()->set_quantization_info(input_quantization_info);
@@ -127,9 +162,11 @@
}
}
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info,
+ int gemm_3d_depth, bool skip_im2col)
{
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool is_activation_enabled = act_info.enabled();
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
@@ -145,8 +182,40 @@
input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+ const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input_quantization_info : output->quantization_info();
+
+ float multiplier = input_quantization_info.scale * weights->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier;
+ int output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+ // Merge activation with output stage
+ int min_activation = 0;
+ int max_activation = 0;
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+ if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+ max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+ }
+
+ GEMMLowpOutputStageInfo output_info;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = output_quant_info.offset;
+ output_info.gemmlowp_multiplier = output_multiplier;
+ output_info.gemmlowp_shift = output_shift;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+
// Perform validation step on GEMMLowp
- return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), nullptr, output, gemm_info);
+ return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info));
}
else
{
@@ -155,19 +224,18 @@
}
}
-Status NEGEMMConvolutionLayer::validate_gemm3d(DataType data_type, int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
{
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const DataType output_gemm_data_type = is_quantized ? DataType::S32 : data_type;
- const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
- const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
+ const DataType data_type = input_info->data_type();
+ const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
+ const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
// Set dummy tensor shapes for the validation
- const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type);
+ const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type);
- const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, output_gemm_data_type);
+ const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
- return validate_mm(&dummy_input_info, &dummy_weights_info, &dummy_output_info, gemm_3d_depth, skip_im2col);
+ return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
}
void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
@@ -202,9 +270,8 @@
_append_bias = (biases != nullptr) && (!_is_quantized);
_is_activationlayer_enabled = act_info.enabled();
- const ITensor *gemm_input_to_use = input;
- ITensor *gemm_output_to_use = output;
- ITensor *gemm_output_staged_to_use = output;
+ const ITensor *gemm_input_to_use = input;
+ ITensor *gemm_output_to_use = output;
// Get convolved dimensions
unsigned int conv_w = 0;
@@ -219,7 +286,7 @@
// Check if GEMM3D is supported
if(data_layout == DataLayout::NHWC)
{
- _skip_col2im = bool(validate_gemm3d(input->info()->data_type(), conv_h, true));
+ _skip_col2im = bool(validate_gemm3d(input->info(), act_info, conv_h, true));
// If not supported, we need to perform im2col and col2im (or reshape layer)
if(!_skip_col2im)
{
@@ -262,26 +329,17 @@
}
// Create temporary GEMM output tensor in case we cannot skip col2im
- if(!_skip_col2im || _is_quantized)
+ if(!_skip_col2im)
{
- // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
- const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
- TensorShape shape_gemm;
+ TensorShape shape_gemm;
- if(_is_quantized && _skip_col2im)
- {
- shape_gemm = output->info()->tensor_shape();
- }
- else
- {
- // Calculate GEMM output shape
- shape_gemm = _im2col_output.info()->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
- }
+ // Calculate GEMM output shape
+ shape_gemm = _im2col_output.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
// FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
- TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
+ TensorInfo info_gemm(shape_gemm, 1, data_type);
info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
_gemm_output.allocator()->init(info_gemm);
_memory_group.manage(&_gemm_output);
@@ -293,62 +351,24 @@
// Configure GEMM
// In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
- configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, gemm_3d_depth);
+ configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, gemm_3d_depth);
if(!_skip_im2col)
{
_im2col_output.allocator()->allocate();
}
- // Configure output stage for quantized case
- if(_is_quantized)
- {
- const QuantizationInfo input_quant_info = input->info()->quantization_info();
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input_quant_info : output->info()->quantization_info();
-
- float multiplier = input_quant_info.scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
- if(!_skip_col2im)
- {
- _memory_group.manage(&_tmp_output);
- gemm_output_staged_to_use = &_tmp_output;
- }
-
- // Merge activation with output stage
- int min_activation = 0;
- int max_activation = 0;
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
- {
- const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
- const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
-
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
- max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
- _is_activationlayer_enabled = false;
- }
-
- _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset, min_activation, max_activation);
- }
-
if(!_skip_col2im)
{
if(_data_layout == DataLayout::NCHW)
{
// Configure col2im
- _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, Size2D(conv_w, conv_h));
+ _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h));
}
else
{
// Configure reshape layer
- _reshape_layer.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output);
+ _reshape_layer.configure(gemm_output_to_use, output);
}
}
@@ -394,11 +414,13 @@
const unsigned int kernel_width = weights->dimension(idx_width);
const unsigned int kernel_height = weights->dimension(idx_height);
- TensorInfo im2col_reshaped_info, info_gemm, tmp_info, weights_reshaped_info;
- const ITensorInfo *gemm_input_to_use = input;
- const ITensorInfo *gemm_output_to_use = output;
- const ITensorInfo *gemm_output_staged_to_use = output;
- const ITensorInfo *weights_to_use = weights;
+ TensorInfo im2col_reshaped_info{};
+ TensorInfo info_gemm{};
+ TensorInfo tmp_info{};
+ TensorInfo weights_reshaped_info{};
+ const ITensorInfo *gemm_input_to_use = input;
+ const ITensorInfo *gemm_output_to_use = output;
+ const ITensorInfo *weights_to_use = weights;
const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
const bool append_bias = (biases != nullptr) && (!is_quantized);
@@ -420,7 +442,7 @@
bool skip_col2im = false;
if(data_layout == DataLayout::NHWC)
{
- skip_col2im = bool(validate_gemm3d(input->data_type(), conv_h, true));
+ skip_col2im = bool(validate_gemm3d(input, act_info, conv_h, true));
// If not supported, we need to perform im2col and col2im (or reshape layer)
if(!skip_col2im)
{
@@ -431,7 +453,7 @@
if(skip_col2im)
{
// If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!bool(validate_gemm3d(input->data_type(), conv_h, skip_im2col)))
+ if(!bool(validate_gemm3d(input, act_info, conv_h, skip_im2col)))
{
skip_im2col = false;
skip_col2im = false;
@@ -495,68 +517,25 @@
}
// Create temporary GEMM output tensor in case we cannot skip col2im
- const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
if(!skip_col2im)
{
TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, conv_w * conv_h);
- info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
+ info_gemm = TensorInfo(shape_gemm, 1, data_type);
}
else
{
- info_gemm = TensorInfo(output->tensor_shape(), 1, gemm_data_type);
+ info_gemm = TensorInfo(output->tensor_shape(), 1, data_type);
}
info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
gemm_output_to_use = &info_gemm;
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 0, skip_im2col));
-
- if(is_quantized)
- {
- const QuantizationInfo input_quant_info = input->quantization_info();
- const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input_quant_info : output->quantization_info();
- const float multiplier = input_quant_info.scale * weights_to_use->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
- if(!skip_col2im)
- {
- tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
- tmp_info.set_quantization_info(output->quantization_info()).set_data_layout(data_layout);
- gemm_output_staged_to_use = &tmp_info;
- }
-
- // Merge activation with output stage
- int min_activation = 0;
- int max_activation = 0;
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
-
- if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
- {
- const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
- const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
-
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
- max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
- is_activation_enabled = false;
- }
-
- // Validate output stage for quantized case
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, min_activation, max_activation);
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col));
// Validate Col2Im/ReshapeLayer
if(!skip_col2im && (data_layout == DataLayout::NCHW))
{
- ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use,
- output,
- Size2D(conv_w, conv_h)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h)));
}
//Validate Activation Layer
@@ -572,7 +551,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(!_skip_im2col)
{
@@ -586,9 +565,6 @@
{
// Run gemmlowp
_mm_gemmlowp.run();
-
- // Run output stage
- _gemmlowp_output_stage.run();
}
else
{
@@ -618,8 +594,6 @@
{
_activationlayer_function.run();
}
-
- _memory_group.release();
}
void NEGEMMConvolutionLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 47c3358..ede89bf 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,4 +1,5 @@
-/* Copyright (c) 2017-2018 ARM Limited.
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -116,7 +117,7 @@
void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_mtx_a_reshape_kernel)
{
NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
@@ -135,6 +136,4 @@
{
NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
}
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 5286f11..54f49a6 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,8 +42,8 @@
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false),
- _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+ _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr), _a_offset(0), _b_offset(0),
+ _run_vector_matrix_multiplication(false), _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false)
{
}
@@ -53,6 +53,9 @@
ARM_COMPUTE_UNUSED(c);
ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
+ const ITensor *matrix_a = a;
+ const ITensor *matrix_b = b;
+
// Clear state
_mtx_a_reshape_kernel = nullptr;
_mtx_b_reshape_kernel = nullptr;
@@ -65,6 +68,18 @@
_is_prepared = false;
_original_b = b;
+ // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+ if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ _fuse_output_stage = true;
+
+ _memory_group.manage(&_mm_result_s32);
+
+ TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
+
+ _mm_result_s32.allocator()->init(info_mm_result_s32);
+ }
+
#ifdef __aarch64__
switch(a->info()->data_type())
{
@@ -72,7 +87,7 @@
case DataType::U8:
case DataType::S8:
{
- _asm_glue.configure(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run);
+ _asm_glue.configure(a, b, _fuse_output_stage ? &_mm_result_s32 : output, 1.f, 0.f, _reshape_b_only_on_first_run);
_dot_product_path = _asm_glue.is_configured();
break;
}
@@ -83,51 +98,35 @@
}
}
#endif /* __aarch64__ */
- if(!_dot_product_path)
+ if(!(_dot_product_path || _run_vector_matrix_multiplication))
{
- if(_run_vector_matrix_multiplication)
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info());
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
+ _tmp_a.allocator()->init(a_info);
+ _tmp_b.allocator()->init(b_info);
+ _memory_group.manage(&_tmp_a);
+ if(!_reshape_b_only_on_first_run)
{
- // Configure matrix multiply kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- k->configure(a, b, output);
- _mm_kernel = std::move(k);
- }
+ _memory_group.manage(&_tmp_b);
}
- else
+
+ // Configure interleave kernel
{
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo info_a = a->info()->clone()->set_tensor_shape(compute_interleaved_shape(*a->info())).set_is_resizable(true);
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorInfo info_b = b->info()->clone()->set_tensor_shape(compute_transpose1xW_shape(*b->info())).set_is_resizable(true);
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
- _memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ k->configure(a, &_tmp_a);
+ _mtx_a_reshape_kernel = std::move(k);
+ }
- // Configure interleave kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
- k->configure(a, &_tmp_a);
- _mtx_a_reshape_kernel = std::move(k);
- }
-
- // Configure transpose kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
- k->configure(b, &_tmp_b);
- _mtx_b_reshape_kernel = std::move(k);
- }
-
- // Configure matrix multiply kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- k->configure(&_tmp_a, &_tmp_b, output);
- _mm_kernel = std::move(k);
- }
+ // Configure transpose kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ k->configure(b, &_tmp_b);
+ _mtx_b_reshape_kernel = std::move(k);
}
}
@@ -158,8 +157,33 @@
_mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
}
- // Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+ if(_fuse_output_stage)
+ {
+ // Configure matrix multiply kernel
+ if(!_dot_product_path)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(matrix_a, matrix_b, &_mm_result_s32);
+ _mm_kernel = std::move(k);
+ }
+
+ _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
+ _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+
+ _mm_result_s32.allocator()->allocate();
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ if(!_dot_product_path)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(matrix_a, matrix_b, output);
+ _mm_kernel = std::move(k);
+ }
+ // Configure offset contribution kernel
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+ }
// Allocate tensors
if(!_dot_product_path && !_run_vector_matrix_multiplication)
@@ -185,43 +209,53 @@
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
"The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo mm_result_s32_info{};
+
int32_t a_offset = a->quantization_info().offset;
int32_t b_offset = b->quantization_info().offset;
const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ if(fuse_output_stage)
+ {
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ }
+
// Check if we need to run the optimized assembly kernel
- const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, output, 1.f, 0.f, reshape_b_only_on_first_run));
+ const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f, reshape_b_only_on_first_run));
if(run_optimised)
{
- if(output->total_size() != 0)
+ ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+ if(gemm_info.depth_output_gemm3d() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
+ if(gemm_info.reinterpret_input_as_3d())
{
- if(gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
- }
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
}
}
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ }
}
else
{
@@ -231,6 +265,9 @@
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
if(!run_vector_matrix_multiplication)
{
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
TensorShape shape_tmp_a = a->tensor_shape();
shape_tmp_a.set(0, a->dimension(0) * 4);
@@ -241,20 +278,17 @@
shape_tmp_b.set(0, b->dimension(1) * 16);
shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
- TensorInfo info_a = a->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
- TensorInfo info_b = b->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
}
}
- TensorInfo info_vector_sum_col, info_vector_sum_row;
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
// Validate matrix B reduction kernel only if _a_offset is not equal to 0
if(a_offset != 0)
@@ -274,12 +308,32 @@
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
}
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- a_offset, b_offset));
+ if(fuse_output_stage)
+ {
+ if(!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
+ }
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c, output, a_offset, b_offset,
+ gemm_info.gemmlowp_output_stage()));
+ }
+ else
+ {
+ if(!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+ }
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ a_offset, b_offset));
+ }
return Status{};
}
@@ -287,7 +341,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Reshape inputs
if(_mtx_a_reshape_kernel)
@@ -321,10 +375,16 @@
NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
}
- // Run offset contribution kernel
- NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
-
- _memory_group.release();
+ if(_fuse_output_stage)
+ {
+ // Run offset contribution kernel
+ NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+ }
+ else
+ {
+ // Run offset contribution kernel
+ NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+ }
}
void NEGEMMLowpMatrixMultiplyCore::prepare()
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index b010ca0..3c7411e 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,10 +59,8 @@
{
NEScheduler::get().schedule(&_border_handler, Window::DimZ);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index 8a85bba..0dbcb12 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,11 +74,6 @@
if(num_levels > 1)
{
- _horizontal_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
- _vertical_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
-
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
@@ -86,19 +81,33 @@
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16);
_tmp.init(pyramid_info);
+ _horizontal_reduction.reserve(num_levels);
+ _vertical_reduction.reserve(num_levels);
+ _horizontal_border_handler.reserve(num_levels);
+ _vertical_border_handler.reserve(num_levels);
+
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
/* Configure horizontal kernel */
- _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
+ auto horizontal_kernel = support::cpp14::make_unique<NEGaussianPyramidHorKernel>();
+ horizontal_kernel->configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
/* Configure vertical kernel */
- _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
+ auto vertical_kernel = support::cpp14::make_unique<NEGaussianPyramidVertKernel>();
+ vertical_kernel->configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
/* Configure border */
- _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+ auto horizontal_border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+ horizontal_border_kernel->configure(_pyramid->get_pyramid_level(i), horizontal_kernel->border_size(), border_mode, PixelValue(constant_border_value));
/* Configure border */
- _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
+ auto vertical_border_kernel = support::cpp14::make_unique<NEFillBorderKernel>();
+ vertical_border_kernel->configure(_tmp.get_pyramid_level(i), vertical_kernel->border_size(), border_mode, PixelValue(pixel_value_u16));
+
+ _vertical_border_handler.emplace_back(std::move(vertical_border_kernel));
+ _horizontal_border_handler.emplace_back(std::move(horizontal_border_kernel));
+ _vertical_reduction.emplace_back(std::move(vertical_kernel));
+ _horizontal_reduction.emplace_back(std::move(horizontal_kernel));
}
_tmp.allocate();
@@ -117,10 +126,10 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- NEScheduler::get().schedule(_horizontal_border_handler.get() + i, Window::DimZ);
- NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
- NEScheduler::get().schedule(_vertical_border_handler.get() + i, Window::DimZ);
- NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
+ NEScheduler::get().schedule(_horizontal_border_handler[i].get(), Window::DimZ);
+ NEScheduler::get().schedule(_horizontal_reduction[i].get(), Window::DimY);
+ NEScheduler::get().schedule(_vertical_border_handler[i].get(), Window::DimZ);
+ NEScheduler::get().schedule(_vertical_reduction[i].get(), Window::DimY);
}
}
@@ -147,19 +156,20 @@
if(num_levels > 1)
{
- _gaus5x5 = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
- _scale_nearest = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
-
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
_tmp.init(pyramid_info);
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
/* Configure gaussian 5x5 */
- _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+ auto gaus5x5_kernel = support::cpp14::make_unique<NEGaussian5x5>();
+ gaus5x5_kernel->configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+ _gaus5x5.emplace_back(std::move(gaus5x5_kernel));
/* Configure scale */
- _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
+ auto scale_kernel = support::cpp14::make_unique<NEScale>();
+ scale_kernel->configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
+ _scale_nearest.emplace_back(std::move(scale_kernel));
}
_tmp.allocate();
@@ -178,7 +188,7 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- _gaus5x5[i].run();
- _scale_nearest[i].run();
+ _gaus5x5[i].get()->run();
+ _scale_nearest[i].get()->run();
}
}
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index 5e98269..8efc091 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -95,7 +95,7 @@
void NEHOGDescriptor::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run gradient
_gradient.run();
@@ -105,6 +105,4 @@
// Run block normalization kernel
NEScheduler::get().schedule(&_block_norm, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index efc8690..90785fe 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,13 +80,11 @@
void NEHOGGradient::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run derivative
_derivative.run();
// Run magnitude/phase kernel
NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 8c834e2..26abc9d 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -126,12 +126,12 @@
_num_block_norm_kernel = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute
_num_hog_detect_kernel = input_hog_detect.size(); // Number of NEHOGDetector functions to compute
- _orient_bin_kernel = arm_compute::support::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
- _block_norm_kernel = arm_compute::support::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
- _hog_detect_kernel = arm_compute::support::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
+ _orient_bin_kernel.reserve(_num_orient_bin_kernel);
+ _block_norm_kernel.reserve(_num_block_norm_kernel);
+ _hog_detect_kernel.reserve(_num_hog_detect_kernel);
+ _hog_space.reserve(_num_orient_bin_kernel);
+ _hog_norm_space.reserve(_num_block_norm_kernel);
_non_maxima_kernel = arm_compute::support::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
- _hog_space = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
- _hog_norm_space = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
// Allocate tensors for magnitude and phase
TensorInfo info_mag(shape_img, Format::S16);
@@ -167,13 +167,17 @@
// Allocate HOG space
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
- _hog_space[i].allocator()->init(info_space);
+ auto hog_space_tensor = support::cpp14::make_unique<Tensor>();
+ hog_space_tensor->allocator()->init(info_space);
// Manage intermediate buffers
- _memory_group.manage(_hog_space.get() + i);
+ _memory_group.manage(hog_space_tensor.get());
// Initialise orientation binning kernel
- _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ auto orient_bin_kernel = support::cpp14::make_unique<NEHOGOrientationBinningKernel>();
+ orient_bin_kernel->configure(&_mag, &_phase, hog_space_tensor.get(), multi_hog->model(idx_multi_hog)->info());
+ _orient_bin_kernel.emplace_back(std::move(orient_bin_kernel));
+ _hog_space.emplace_back(std::move(hog_space_tensor));
}
// Allocate intermediate tensors
@@ -188,19 +192,23 @@
// Allocate normalized HOG space
TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
- _hog_norm_space[i].allocator()->init(tensor_info);
+ auto hog_norm_space_tensor = support::cpp14::make_unique<Tensor>();
+ hog_norm_space_tensor->allocator()->init(tensor_info);
// Manage intermediate buffers
- _memory_group.manage(_hog_norm_space.get() + i);
+ _memory_group.manage(hog_norm_space_tensor.get());
// Initialize block normalization kernel
- _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ auto block_norm_kernel = support::cpp14::make_unique<NEHOGBlockNormalizationKernel>();
+ block_norm_kernel->configure(_hog_space[idx_orient_bin].get(), hog_norm_space_tensor.get(), multi_hog->model(idx_multi_hog)->info());
+ _block_norm_kernel.emplace_back(std::move(block_norm_kernel));
+ _hog_norm_space.emplace_back(std::move(hog_norm_space_tensor));
}
// Allocate intermediate tensors
for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
{
- _hog_space[i].allocator()->allocate();
+ _hog_space[i].get()->allocator()->allocate();
}
// Configure HOG detector kernel
@@ -208,7 +216,9 @@
{
const size_t idx_block_norm = input_hog_detect[i];
- _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ auto hog_detect_kernel = support::cpp14::make_unique<NEHOGDetector>();
+ hog_detect_kernel->configure(_hog_norm_space[idx_block_norm].get(), multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ _hog_detect_kernel.emplace_back(std::move(hog_detect_kernel));
}
// Configure non maxima suppression kernel
@@ -217,7 +227,7 @@
// Allocate intermediate tensors
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
- _hog_norm_space[i].allocator()->allocate();
+ _hog_norm_space[i]->allocator()->allocate();
}
}
@@ -225,7 +235,7 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Reset detection window
_detection_windows->clear();
@@ -234,21 +244,21 @@
_gradient_kernel.run();
// Run orientation binning kernel
- for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ for(auto &kernel : _orient_bin_kernel)
{
- NEScheduler::get().schedule(_orient_bin_kernel.get() + i, Window::DimY);
+ NEScheduler::get().schedule(kernel.get(), Window::DimY);
}
// Run block normalization kernel
- for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ for(auto &kernel : _block_norm_kernel)
{
- NEScheduler::get().schedule(_block_norm_kernel.get() + i, Window::DimY);
+ NEScheduler::get().schedule(kernel.get(), Window::DimY);
}
// Run HOG detector kernel
- for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ for(auto &kernel : _hog_detect_kernel)
{
- _hog_detect_kernel[i].run();
+ kernel->run();
}
// Run non-maxima suppression kernel if enabled
@@ -256,6 +266,4 @@
{
NEScheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
}
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index db5e926..3eadbee 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,7 +90,7 @@
_score.allocator()->init(tensor_info_score);
_nonmax.allocator()->init(tensor_info_score);
- _corners_list = arm_compute::support::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+ _corners_list.resize(shape.x() * shape.y());
// Set/init Sobel kernel accordingly with gradient_size
switch(gradient_size)
@@ -171,20 +171,20 @@
_score.allocator()->allocate();
// Init corner candidates kernel
- _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+ _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
// Allocate once all the configure methods have been called
_nonmax.allocator()->allocate();
// Init euclidean distance
- _sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
+ _sort_euclidean.configure(_corners_list.data(), corners, &_num_corner_candidates, min_dist);
}
void NEHarrisCorners::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Init to 0 number of corner candidates
_num_corner_candidates = 0;
@@ -207,6 +207,4 @@
// Run sort & euclidean distance
NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index f333ecb..d56bd7c 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@
using namespace arm_compute;
NEHistogram::NEHistogram()
- : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::support::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
+ : _histogram_kernel(), _local_hist(), _window_lut(window_lut_default_size), _local_hist_size(0)
{
}
@@ -45,10 +45,10 @@
// Allocate space for threads local histograms
_local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
- _local_hist = arm_compute::support::cpp14::make_unique<uint32_t[]>(_local_hist_size);
+ _local_hist.resize(_local_hist_size);
// Configure kernel
- _histogram_kernel.configure(input, output, _local_hist.get(), _window_lut.get());
+ _histogram_kernel.configure(input, output, _local_hist.data(), _window_lut.data());
}
void NEHistogram::run()
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 56da966..c9ab5c9 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,11 +68,9 @@
void NEL2NormalizeLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
_reduce_func.run();
NEScheduler::get().schedule(&_normalize_kernel, Window::DimY);
-
- _memory_group.release();
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 9e7a713..3d3c6a1 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -43,10 +43,10 @@
_pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
_pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
_accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
- _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
- _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
- _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
- _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
+ _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
+ _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
+ _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(),
+ _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false)
{
}
@@ -96,22 +96,32 @@
// Configure block that calculates the forget gate
// forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
- TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ // We optimize this as follows:
+ // forget_gate = Activation( (input,output_state_in) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
_forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
_forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _memory_group.manage(&_forget_gate_out1);
- _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
+ std::vector<const ITensor *> inputs_vector;
+ inputs_vector.emplace_back(input);
+ inputs_vector.emplace_back(output_state_in);
+
_memory_group.manage(&_forget_gate_out2);
- _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
- _memory_group.manage(&_forget_gate_out3);
- _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
- _forget_gate_out2.allocator()->allocate();
+ _concat_inputs_forget_gate.configure(inputs_vector, &_forget_gate_out2);
+
+ std::vector<const ITensor *> weights_vector;
+
+ weights_vector.emplace_back(input_to_forget_weights);
+ weights_vector.emplace_back(recurrent_to_forget_weights);
+
+ _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6);
+
_memory_group.manage(&_forget_gate_out5);
- _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
- _forget_gate_out1.allocator()->allocate();
+ _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
+ _memory_group.manage(&_forget_gate_out1);
+ _memory_group.manage(&_forget_gate_out3);
+ _forget_gate_out6.allocator()->allocate();
+
Tensor *forget_gate_out = &_forget_gate_out5;
if(lstm_params.has_peephole_opt())
{
@@ -134,6 +144,8 @@
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
// input_gate = 1 - forget_gate, with CIFG
+ // We optimize this as follows:
+ // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
Tensor *input_gate_out = &_input_gate_out1;
if(lstm_params.has_cifg_opt())
@@ -146,31 +158,29 @@
}
else
{
- TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
-
- _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
_input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ std::vector<const ITensor *> lstm_weights;
+ lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+ lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+
+ _concat_weights_input_gate.configure(lstm_weights, &_input_gate_out2);
_memory_group.manage(&_input_gate_out1);
- _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
- _memory_group.manage(&_input_gate_out2);
- _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
- _memory_group.manage(&_input_gate_out3);
- _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
- _input_gate_out2.allocator()->allocate();
_memory_group.manage(&_input_gate_out4);
- _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
- _input_gate_out3.allocator()->allocate();
- input_gate_out = &_input_gate_out4;
+
+ _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3);
+ _input_gate_out2.allocator()->allocate();
+ input_gate_out = &_input_gate_out3;
+
if(_run_peephole_opt)
{
- _memory_group.manage(&_input_gate_out5);
- _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _memory_group.manage(&_input_gate_out4);
+ _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
- _input_gate_out5.allocator()->allocate();
input_gate_out = &_input_gate_out1;
}
else
@@ -215,35 +225,37 @@
// Configure block that calculates the output
// output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
- TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ // We optimize this as follows:
+ // output_state_out = Activation( (input,output_state_in) * (input_to_output_weights, recurrent_to_output_weights) + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
_output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
- _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ std::vector<const ITensor *> in_out_weights;
+ in_out_weights.emplace_back(input_to_output_weights);
+ in_out_weights.emplace_back(recurrent_to_output_weights);
+
+ _concat_weights_output.configure(in_out_weights, &_output2);
_memory_group.manage(&_output1);
- _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
- _memory_group.manage(&_output2);
- _transpose_output.configure(recurrent_to_output_weights, &_output2);
- _memory_group.manage(&_output3);
- _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
+ _memory_group.manage(&_output4);
+
+ _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4);
+
_output2.allocator()->allocate();
- _memory_group.manage(&_output5);
- _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
- _output3.allocator()->allocate();
- Tensor *output_gate_out = &_output5;
+ _forget_gate_out2.allocator()->allocate();
+
+ Tensor *output_gate_out = &_output4;
if(lstm_params.has_peephole_opt())
{
- _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
+ _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
- _memory_group.manage(&_output4);
- _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
- _output5.allocator()->allocate();
+ _memory_group.manage(&_output3);
+ _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+ _output4.allocator()->allocate();
output_gate_out = &_output1;
// Allocate intermediate buffers
- _output4.allocator()->allocate();
+ _output3.allocator()->allocate();
}
else
{
@@ -368,10 +380,15 @@
TensorInfo output_gate_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+ std::vector<const ITensorInfo *> inputs_vector;
+ inputs_vector.emplace_back(input);
+ inputs_vector.emplace_back(output_state_in);
+ TensorInfo forget_gate_concat;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, &forget_gate_concat));
+
// Validate forget gate
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
@@ -389,9 +406,13 @@
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
+ std::vector<const ITensorInfo *> lstm_weights;
+ lstm_weights.emplace_back(lstm_params.input_to_input_weights());
+ lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
+ TensorInfo lstm_gate_concat;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(lstm_weights, &lstm_gate_concat));
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
@@ -421,9 +442,14 @@
}
// Validate output gate tmp
+ std::vector<const ITensorInfo *> in_out_weights;
+ in_out_weights.emplace_back(input_to_output_weights);
+ in_out_weights.emplace_back(recurrent_to_output_weights);
+ TensorInfo in_out_gate_concat;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(in_out_weights, &in_out_gate_concat));
+
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
@@ -465,12 +491,12 @@
void NELSTMLayer::run()
{
- _memory_group.acquire();
+ prepare();
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _concat_inputs_forget_gate.run();
_fully_connected_forget_gate.run();
- NEScheduler::get().schedule(&_transpose_forget_gate, Window::DimY);
- _gemm_forget_gate.run();
- NEScheduler::get().schedule(&_accum_forget_gate1, Window::DimY);
if(_run_peephole_opt)
{
@@ -494,9 +520,7 @@
else
{
_fully_connected_input_gate.run();
- NEScheduler::get().schedule(&_transpose_input_gate, Window::DimY);
- _gemm_input_gate.run();
- NEScheduler::get().schedule(&_accum_input_gate1, Window::DimY);
+
if(_run_peephole_opt)
{
NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY);
@@ -520,10 +544,6 @@
}
_fully_connected_output.run();
- NEScheduler::get().schedule(&_transpose_output, Window::DimY);
- _gemm_output.run();
- NEScheduler::get().schedule(&_accum_output1, Window::DimY);
-
if(_run_peephole_opt)
{
NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY);
@@ -547,6 +567,18 @@
NEScheduler::get().schedule(&_copy_output, Window::DimY);
_concat_scratch_buffer.run();
+}
- _memory_group.release();
-}
\ No newline at end of file
+void NELSTMLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ _concat_weights_forget_gate.run();
+ if(!_run_cifg_opt)
+ {
+ _concat_weights_input_gate.run();
+ }
+ _concat_weights_output.run();
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 0e149d4..5174a13 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -92,8 +92,8 @@
// Create Gaussian Pyramid function
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
- _convf = arm_compute::support::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
- _subf = arm_compute::support::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
+ _convf.resize(_num_levels);
+ _subf.resize(_num_levels);
for(unsigned int i = 0; i < _num_levels; ++i)
{
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index 9ad9689..b2d889b 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -64,8 +64,8 @@
_tmp_pyr.init(pyramid_info);
// Allocate add and scale functions. Level 0 does not need to be scaled.
- _addf = arm_compute::support::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
- _scalef = arm_compute::support::cpp14::make_unique<NEScale[]>(num_levels - 1);
+ _addf.resize(num_levels);
+ _scalef.resize(num_levels - 1);
const size_t last_level = num_levels - 1;
@@ -86,7 +86,7 @@
void NELaplacianReconstruct::run()
{
- ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+ ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function");
const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 80a2541..d08202d 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -168,7 +168,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
// Run input reshaping
NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
@@ -178,8 +178,6 @@
// Reshape output matrix
NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
-
- _memory_group.release();
}
void NELocallyConnectedLayer::prepare()
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index f00114f..d52e928 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,11 +69,9 @@
void NENormalizationLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
NEScheduler::get().schedule(&_border_handler, Window::DimY);
NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index e90d8f6..0df01c6 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,10 +74,10 @@
const float pyr_scale = old_pyramid->info()->scale();
- _func_scharr = arm_compute::support::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
- _kernel_tracker = arm_compute::support::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
- _scharr_gx = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
- _scharr_gy = arm_compute::support::cpp14::make_unique<Tensor[]>(_num_levels);
+ _func_scharr.reserve(_num_levels);
+ _kernel_tracker.reserve(_num_levels);
+ _scharr_gx.reserve(_num_levels);
+ _scharr_gy.reserve(_num_levels);
_old_points_internal = LKInternalKeypointArray(old_points->num_values());
_new_points_internal = LKInternalKeypointArray(old_points->num_values());
@@ -95,25 +95,34 @@
TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16);
- _scharr_gx[i].allocator()->init(tensor_info);
- _scharr_gy[i].allocator()->init(tensor_info);
+ auto scharr_gx = support::cpp14::make_unique<Tensor>();
+ auto scharr_gy = support::cpp14::make_unique<Tensor>();
+ scharr_gx->allocator()->init(tensor_info);
+ scharr_gy->allocator()->init(tensor_info);
// Manage intermediate buffers
- _memory_group.manage(_scharr_gx.get() + i);
- _memory_group.manage(_scharr_gy.get() + i);
+ _memory_group.manage(scharr_gx.get());
+ _memory_group.manage(scharr_gy.get());
// Init Scharr kernel
- _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
+ auto func_scharr = support::cpp14::make_unique<NEScharr3x3>();
+ func_scharr->configure(old_ith_input, scharr_gx.get(), scharr_gy.get(), border_mode, constant_border_value);
// Init Lucas-Kanade kernel
- _kernel_tracker[i].configure(old_ith_input, new_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i,
- old_points, new_points_estimates, new_points,
- &_old_points_internal, &_new_points_internal,
- termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
- i, _num_levels, pyr_scale);
+ auto kernel_tracker = support::cpp14::make_unique<NELKTrackerKernel>();
+ kernel_tracker->configure(old_ith_input, new_ith_input, scharr_gx.get(), scharr_gy.get(),
+ old_points, new_points_estimates, new_points,
+ &_old_points_internal, &_new_points_internal,
+ termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
+ i, _num_levels, pyr_scale);
- _scharr_gx[i].allocator()->allocate();
- _scharr_gy[i].allocator()->allocate();
+ scharr_gx->allocator()->allocate();
+ scharr_gy->allocator()->allocate();
+
+ _func_scharr.emplace_back(std::move(func_scharr));
+ _kernel_tracker.emplace_back(std::move(kernel_tracker));
+ _scharr_gx.emplace_back(std::move(scharr_gx));
+ _scharr_gy.emplace_back(std::move(scharr_gy));
}
}
@@ -121,16 +130,14 @@
{
ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
for(unsigned int level = _num_levels; level > 0; --level)
{
// Run Scharr kernel
- _func_scharr[level - 1].run();
+ _func_scharr[level - 1].get()->run();
// Run Lucas-Kanade kernel
- NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
+ NEScheduler::get().schedule(_kernel_tracker[level - 1].get(), Window::DimX);
}
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index f5c2718..c608edf 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -25,7 +25,6 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -61,18 +60,28 @@
return coords;
}
+
+uint32_t last_padding_dimension(const PaddingList &padding)
+{
+ int last_padding_dim = padding.size() - 1;
+ for(; last_padding_dim >= 0; --last_padding_dim)
+ {
+ if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
+ {
+ break;
+ }
+ }
+ return static_cast<uint32_t>(last_padding_dim);
+}
} // namespace
NEPadLayer::NEPadLayer()
- : _memset_kernel(), _copy_kernel(), _output_subtensor()
+ : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results(), _output_subtensor()
{
}
-void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, PixelValue constant_value)
+void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_THROW_ON_ERROR(NEPadLayer::validate(input->info(), output->info(), padding, constant_value));
-
// Auto-init
auto_init_if_empty(*output->info(), get_expected_output_tensorinfo(*input->info(), padding));
@@ -86,23 +95,235 @@
_copy_kernel.configure(input, &_output_subtensor);
}
-Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *output)
+{
+ // Reflecting can be performed by effectively unfolding the input as follows:
+ // For each dimension starting at DimX:
+ // For before and after:
+ // Use strided slice to extract and reverse the part of the
+ // input / previously produced tensor required for the padding.
+ // Concatenate the before and after padding with the input / previously
+ // produced tensor along the current dimension.
+
+ // Two strided slice functions will be required for each dimension padded as well as a
+ // concatenate function and the tensors to hold the temporary results.
+ _slice_functions.resize(2 * _num_dimensions);
+ _slice_results.resize(2 * _num_dimensions);
+ _concat_functions.resize(_num_dimensions);
+ _concat_results.resize(_num_dimensions - 1);
+
+ Coordinates starts_before{};
+ Coordinates ends_before{};
+ Coordinates starts_after{};
+ Coordinates ends_after{};
+ Coordinates strides{};
+ ITensor *prev = input;
+ for(uint32_t i = 0; i < _num_dimensions; ++i)
+ {
+ // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
+ if(i > 0)
+ {
+ strides.set(i - 1, 1);
+ }
+
+ if(_padding[i].first > 0 || _padding[i].second > 0)
+ {
+ // Set the starts, ends, and strides values for the current dimension.
+ // Due to the bit masks passed to strided slice, the values below the current dimension in
+ // starts and ends will be ignored so do not need to be modified.
+ if(_mode == PaddingMode::REFLECT)
+ {
+ starts_before.set(i, _padding[i].first);
+ ends_before.set(i, 0);
+ starts_after.set(i, input->info()->dimension(i) - 2);
+ ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
+ strides.set(i, -1);
+ }
+ else
+ {
+ starts_before.set(i, _padding[i].first - 1);
+ ends_before.set(i, -1);
+ starts_after.set(i, input->info()->dimension(i) - 1);
+ ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
+ strides.set(i, -1);
+ }
+
+ // Strided slice wraps negative indexes around to the end of the range,
+ // instead this should indicate use of the full range and so the bit mask will be modified.
+ const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t end_mask_before = ends_before[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t begin_mask_after = starts_after[i] < 0 ? ~0 : ~(1u << i);
+ const int32_t end_mask_after = ends_after[i] < 0 ? ~0 : ~(1u << i);
+
+ // Reflect the input values for the padding before and after the input.
+ std::vector<ITensor *> concat_vector;
+ if(_padding[i].first > 0)
+ {
+ if(i < prev->info()->num_dimensions())
+ {
+ _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+ concat_vector.emplace_back(&_slice_results[2 * i]);
+ }
+ else
+ {
+ // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+ concat_vector.push_back(prev);
+ }
+ }
+ concat_vector.push_back(prev);
+ if(_padding[i].second > 0)
+ {
+ if(i < prev->info()->num_dimensions())
+ {
+ _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+ concat_vector.emplace_back(&_slice_results[2 * i + 1]);
+ }
+ else
+ {
+ // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
+ concat_vector.push_back(prev);
+ }
+ }
+ // Concatenate the padding before and after with the input.
+ ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i];
+ _concat_functions[i].configure(concat_vector, out, i);
+ if(i != _num_dimensions - 1)
+ {
+ _concat_results[i].allocator()->allocate();
+ }
+ prev = out;
+ }
+ _slice_results[2 * i].allocator()->allocate();
+ _slice_results[2 * i + 1].allocator()->allocate();
+ }
+}
+
+void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+
+ _padding = padding;
+ _mode = mode;
+
+ const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
+
+ // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
+ _num_dimensions = last_padding_dimension(padding) + 1;
+ if(_num_dimensions > 0)
+ {
+ switch(_mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ configure_constant_mode(input, output, padding, constant_value);
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ configure_reflect_symmetric_mode(input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+ }
+ else
+ {
+ // Copy the input to the whole output if no padding is applied
+ _copy_kernel.configure(input, output);
+ }
+}
+
+Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
{
ARM_COMPUTE_UNUSED(constant_value);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- auto output_clone = output->clone();
+ const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
- SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
- ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+ if(output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+ switch(mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ auto output_clone = output->clone();
+ SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
+ ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ for(uint32_t i = 0; i < padding.size(); ++i)
+ {
+ if(mode == PaddingMode::REFLECT)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
+ }
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid mode");
+ }
+ }
return Status{};
}
void NEPadLayer::run()
{
- NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
- NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+ if(_num_dimensions > 0)
+ {
+ switch(_mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+ break;
+ }
+ case PaddingMode::REFLECT:
+ case PaddingMode::SYMMETRIC:
+ {
+ for(uint32_t i = 0; i < _num_dimensions; ++i)
+ {
+ if(_padding[i].first > 0 || _padding[i].second > 0)
+ {
+ if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+ {
+ _slice_functions[2 * i].run();
+ }
+ if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+ {
+ _slice_functions[2 * i + 1].run();
+ }
+ _concat_functions[i].run();
+ }
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+ }
+ else
+ {
+ NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+ }
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index cf6b984..ef28fe9 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,8 +29,8 @@
#include <utility>
-using namespace arm_compute;
-
+namespace arm_compute
+{
void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
@@ -51,3 +51,27 @@
{
return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
}
+
+void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEComplexPixelWiseMultiplicationKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 8f7db96..65873b1 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,22 +26,13 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
using namespace arm_compute;
-NEQuantizationLayer::NEQuantizationLayer()
- : _quantize_kernel(), _min_max_kernel(), _min_max()
-{
-}
-
Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-
- TensorInfo min_max{ input->num_channels(), input->data_type() };
- ARM_COMPUTE_RETURN_ON_ERROR(NEMinMaxLayerKernel::validate(input, &min_max));
- ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output, &min_max));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output));
return Status{};
}
@@ -50,24 +41,8 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel
- _min_max_kernel.configure(input, &_min_max);
-
// Configure quantize kernel
- _quantize_kernel.configure(input, output, &_min_max);
-
- // Allocate min_max tensor
- _min_max.allocator()->allocate();
-}
-
-void NEQuantizationLayer::run()
-{
- // Reset min and max
- _min_max_kernel.reset();
-
- // Run min and max kernel
- NEScheduler::get().schedule(&_min_max_kernel, Window::DimY);
-
- // Run quantize kernel
- NEScheduler::get().schedule(&_quantize_kernel, Window::DimY);
+ auto k = arm_compute::support::cpp14::make_unique<NEQuantizationLayerKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index 995d5ee..9ca7ded 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -104,7 +104,7 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
_fully_connected_kernel.run();
@@ -115,8 +115,6 @@
// copy hidden out to output
NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
-
- _memory_group.release();
}
void NERNNLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 014895f..0b145f0 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
@@ -37,6 +38,8 @@
{
ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
TensorShape out_shape = input->tensor_shape();
@@ -78,10 +81,10 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- _reduction_ops = reduction_axis.num_dimensions();
- _reduction_kernels = arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
- _reduced_outs = arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
- _keep_dims = keep_dims;
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels.resize(_reduction_ops);
+ _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
Coordinates axis_local = reduction_axis;
const int input_dims = input->info()->num_dimensions();
@@ -96,9 +99,9 @@
// Perform reduction for every axis
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
- TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+ TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
- auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+ auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
if(i == _reduction_ops - 1 && keep_dims)
{
@@ -107,8 +110,8 @@
else
{
_reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
- _memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
+ _memory_group.manage(&_reduced_outs[i]);
+ _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
}
}
@@ -131,13 +134,13 @@
out_shape.remove_dimension(axis_local[i] - i);
}
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
- _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+ _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
}
}
void NEReduceMean::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
@@ -148,5 +151,4 @@
{
_reshape.run();
}
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 9f81a40..a0aed96 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -66,7 +66,8 @@
void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op));
// Configure reduction kernel
_reduction_kernel.configure(input, output, axis, op);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 483aa4c..425ee6c 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -97,14 +97,17 @@
_dx(),
_dy(),
_scale_kernel(),
- _border_handler()
+ _border_handler(),
+ _use_padding(true)
{
}
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy));
+ ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy, use_padding));
+
+ _use_padding = use_padding;
// Get data layout and width/height indices
const DataLayout data_layout = input->info()->data_layout();
@@ -134,7 +137,7 @@
TensorInfo tensor_info_offsets(shape, Format::S32);
_offsets.allocator()->init(tensor_info_offsets);
- _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, sampling_policy);
+ _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -152,7 +155,7 @@
_dx.allocator()->init(tensor_info_dxdy);
_dy.allocator()->init(tensor_info_dxdy);
- _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, sampling_policy);
+ _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -165,18 +168,20 @@
}
case InterpolationPolicy::AREA:
{
- _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode);
+ _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode, constant_border_value);
break;
}
default:
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
-
- _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
+ if(use_padding)
+ {
+ _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
+ }
}
Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
- BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+ BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
@@ -213,12 +218,15 @@
}
ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(),
- policy, border_mode, sampling_policy));
+ policy, border_mode, constant_border_value, sampling_policy, use_padding));
return Status{};
}
void NEScale::run()
{
- NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+ if(_use_padding)
+ {
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+ }
NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index d8f4eda..2ddfee5 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,10 +81,8 @@
{
NEScheduler::get().schedule(&_border_handler, Window::DimZ);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index 5b6f60b..b47a37a 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,10 +81,8 @@
{
NEScheduler::get().schedule(&_border_handler, Window::DimZ);
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
-
- _memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 36b7d47..79a9496 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -180,7 +180,7 @@
void NESoftmaxLayer::run()
{
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(_needs_flattening)
{
@@ -195,7 +195,5 @@
{
NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
}
-
- _memory_group.release();
}
} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
new file mode 100644
index 0000000..46c28ad
--- /dev/null
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NESpaceToBatchLayer::NESpaceToBatchLayer()
+ : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
+{
+}
+
+void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+
+ if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ {
+ _has_padding = true;
+ _memset_kernel.configure(output, PixelValue());
+ }
+ _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+}
+
+void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ {
+ _has_padding = true;
+ _memset_kernel.configure(output, PixelValue());
+ }
+ _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+}
+
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+ return Status{};
+}
+
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+ return Status{};
+}
+
+void NESpaceToBatchLayer::run()
+{
+ // Zero out output only if we have paddings
+ if(_has_padding)
+ {
+ NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+ }
+ NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
index e947657..0373ab6 100644
--- a/src/runtime/NEON/functions/NESplit.cpp
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -42,8 +42,8 @@
void NESplit::configure(const ITensor *input, const std::vector<ITensor *> &outputs, unsigned int axis)
{
// Create Slice functions
- _num_outputs = outputs.size();
- _slice_functions = arm_compute::support::cpp14::make_unique<NESlice[]>(_num_outputs);
+ _num_outputs = outputs.size();
+ _slice_functions.resize(_num_outputs);
// Get output shape
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index 2f49c22..32350b0 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -43,8 +43,8 @@
void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output)
{
- _num_inputs = input.size();
- _stack_kernels = arm_compute::support::cpp14::make_unique<NEStackLayerKernel[]>(_num_inputs);
+ _num_inputs = input.size();
+ _stack_kernels.resize(_num_inputs);
// Wrap around negative values
const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
index 7532020..21f35f8 100644
--- a/src/runtime/NEON/functions/NEUnstack.cpp
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -74,7 +74,7 @@
// Wrap around negative values
const unsigned int axis_u = wrap_axis(axis, input->info());
_num_slices = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
- _strided_slice_vector = arm_compute::support::cpp14::make_unique<NEStridedSlice[]>(_num_slices);
+ _strided_slice_vector.resize(_num_slices);
Coordinates slice_start;
int32_t slice_end_mask;
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
index 7e435c3..25b5216 100644
--- a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
@@ -40,14 +40,15 @@
{
}
-Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+template <typename TensorInfoType, typename>
+inline Status NEWidthConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
// Output auto inizialitation if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
unsigned int width_offset = 0;
@@ -60,8 +61,8 @@
return Status{};
}
-
-void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+template <typename TensorType, typename>
+inline void NEWidthConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output)
{
_num_inputs = inputs_vector.size();
@@ -70,7 +71,7 @@
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
@@ -78,7 +79,7 @@
unsigned int width_offset = 0;
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEWidthConcatenateLayerKernel[]>(_num_inputs);
+ _concat_kernels_vector.resize(_num_inputs);
for(unsigned int i = 0; i < _num_inputs; ++i)
{
@@ -87,10 +88,30 @@
}
}
+void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+{
+ configure_internal(std::move(inputs_vector), output);
+}
+
+void NEWidthConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, ITensor *output)
+{
+ configure_internal(std::move(inputs_vector), output);
+}
+
+Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+ return validate_internal(inputs_vector, output);
+}
+
+Status NEWidthConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+ return validate_internal(inputs_vector, output);
+}
+
void NEWidthConcatenateLayer::run()
{
for(unsigned i = 0; i < _num_inputs; ++i)
{
- NEScheduler::get().schedule(_concat_kernels_vector.get() + i, Window::DimY);
+ NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimY);
}
}
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index e37f8ab..1513786 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "support/ToolchainSupport.h"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp"
namespace arm_compute
{
@@ -162,7 +162,7 @@
const int in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
const int in_batches = input->info()->dimension(3);
- return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+ return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
}
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
@@ -234,12 +234,12 @@
} //namespace
-NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
: _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
- _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
- _is_prepared(false), _is_activationlayer_enabled(false)
+ _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
+ _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false)
{
-} /* arm_compute */
+}
void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
bool enable_fast_math)
@@ -380,20 +380,17 @@
// Kernel Storage
const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
in_channels)
- * data_type_size
- + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+ * data_type_size;
// Input storage
const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
use_same_padding)
- * data_type_size
- + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+ * data_type_size;
// Output storage
const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
use_same_padding)
- * data_type_size
- + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+ * data_type_size;
;
const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
@@ -431,14 +428,16 @@
d_strides.set(2, 0);
d_strides.set(3, data_type_size * output_matrix_stride);
- TensorInfo a_info, b_info, d_info;
+ TensorInfo a_info{};
+ TensorInfo b_info{};
+ TensorInfo d_info{};
a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
- _input_workspace.allocator()->init(a_info, storage_alignment);
+ _input_transformed.allocator()->init(a_info, storage_alignment);
_kernel_storage.allocator()->init(b_info, storage_alignment);
- _output_workspace.allocator()->init(d_info, storage_alignment);
+ _output_transformed.allocator()->init(d_info, storage_alignment);
// configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
@@ -446,47 +445,58 @@
1, _output->info()->data_type());
_output_nhwc.allocator()->init(info);
- // Configure the InputTransform
- _memory_group.manage(&_input_workspace);
- _memory_group.manage(&_output_workspace);
+ const ITensor *input_to_use = _input;
+ ITensor *output_to_use = _output;
+ PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
+ const unsigned int max_num_threads = NEScheduler::get().num_threads();
+ // Configure the kernel to transform the input tensor from NCHW -> NHWC
if(data_layout == DataLayout::NCHW)
{
- // configure the kernel to transform the input tensor from NCHW -> NHWC
+ _memory_group.manage(&_input_nhwc);
_permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- _input_nhwc.allocator()->allocate();
- transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- &_input_workspace, input_matrix_stride);
-
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
-
- transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
- //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
- _memory_group.manage(&_output_nhwc);
- transform_output_kernel->configure(biases, &_output_workspace,
- output_matrix_stride, &_output_nhwc,
- in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
- }
- else
- {
- transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- &_input_workspace, input_matrix_stride);
-
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
-
- transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
- transform_output_kernel->configure(biases, &_output_workspace,
- output_matrix_stride, _output,
- in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+ input_to_use = &_input_nhwc;
+ weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
}
- _gemm_function.configure(&_input_workspace, &_kernel_storage, nullptr, &_output_workspace, 1.0f, 0.f);
+ // Configure input transform kernel
+ _memory_group.manage(&_input_transformed);
+ _memory_group.manage(&_input_workspace);
+ transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+ &_input_transformed, input_matrix_stride, &_input_workspace);
+ const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
+ TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
+ _input_workspace.allocator()->init(input_workspace_info);
_input_workspace.allocator()->allocate();
+ if(data_layout == DataLayout::NCHW)
+ {
+ _input_nhwc.allocator()->allocate();
+ }
+
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+ _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector);
+ transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+
+ // Configure GEMM function
+ _memory_group.manage(&_output_transformed);
+ _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
+ _input_transformed.allocator()->allocate();
+
+ // Configure output transform function
+ // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
+ if(data_layout == DataLayout::NCHW)
+ {
+ _memory_group.manage(&_output_nhwc);
+ output_to_use = &_output_nhwc;
+ }
+ transform_output_kernel->configure(biases, &_output_transformed,
+ output_matrix_stride, output_to_use,
+ in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace);
+ const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
+ TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
+ _output_workspace.allocator()->init(output_workspace_info);
_output_workspace.allocator()->allocate();
+ _output_transformed.allocator()->allocate();
// Reorder the convoluted output to ACL's ordering NCHW
if(data_layout == DataLayout::NCHW)
@@ -513,7 +523,7 @@
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
if(data_layout == DataLayout::NCHW)
{
@@ -526,6 +536,7 @@
//Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
_gemm_function.run();
+
// Transform output tensor to the spatial domain
NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
@@ -539,8 +550,6 @@
{
_activationlayer_function.run();
}
-
- _memory_group.release();
}
Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
new file mode 100644
index 0000000..049bf66
--- /dev/null
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
+std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor *input,
+ const ITensor *weights,
+ ITensor *output,
+ PadStrideInfo conv_info,
+ ActivationLayerInfo act_info)
+{
+ const DataType data_type = input->info()->data_type();
+ const TensorShape shape = input->info()->tensor_shape();
+
+ const int n_batches = shape[3];
+ const int in_rows = shape.z();
+ const int in_cols = shape.y();
+ const int n_channels = shape.x();
+ const int padding_top = conv_info.pad_top();
+ const int padding_left = conv_info.pad_left();
+ const int padding_bottom = conv_info.pad_bottom();
+ const int padding_right = conv_info.pad_right();
+
+ const unsigned int stride_x = conv_info.stride().first;
+
+ // Map activation function
+ neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
+ if(arm_compute::utils::info_helpers::is_relu(act_info))
+ {
+ activation = neon_convolution_kernels::ActivationFunction::ReLU;
+ }
+ else if(arm_compute::utils::info_helpers::is_relu6(act_info))
+ {
+ activation = neon_convolution_kernels::ActivationFunction::ReLU6;
+ }
+
+ // Create quantized convolver
+ if(data_type == DataType::QASYMM8)
+ {
+ const QuantizationInfo &input_qinfo = input->info()->quantization_info();
+ const QuantizationInfo &weights_qinfo = weights->info()->quantization_info();
+ const QuantizationInfo &output_qinfo = output->info()->quantization_info();
+
+ // Check that quantization info are in the range [0, 255]
+ ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
+ ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255);
+ ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
+ const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
+ const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale };
+ const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
+
+ // Calculate rescale parameters
+ const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int qmultiplier = 0;
+ int qshift = 0;
+ quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
+ qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler);
+
+ // Create convolver
+ switch(stride_x)
+ {
+ case 1:
+ return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
+ n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+ case 2:
+ return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
+ n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
+ default:
+ return nullptr;
+ }
+ }
+ else
+ {
+ // Create float convolver
+ switch(data_type)
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ switch(stride_x)
+ {
+ case 1:
+ return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
+ n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ case 2:
+ return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
+ n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ default:
+ return nullptr;
+ }
+ break;
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F32:
+ {
+ switch(stride_x)
+ {
+ case 1:
+ return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
+ n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ case 2:
+ return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
+ n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
+ default:
+ return nullptr;
+ }
+ break;
+ }
+ default:
+ return nullptr;
+ }
+ }
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false), _dwc_assembly_kernel(nullptr),
+ _dwc_acl_kernel()
+{
+}
+#endif /* DOXYGEN_SKIP_THIS */
+
+void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_UNUSED(depth_multiplier);
+ ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionAssemblyDispatch::validate(input->info(),
+ weights->info(),
+ bias != nullptr ? bias->info() : nullptr,
+ output->info(),
+ conv_info,
+ depth_multiplier,
+ act_info));
+
+ // Output auto inizialitation if not yet initialized
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+
+ _input = input;
+ _weights = weights;
+ _bias = bias;
+ _output = output;
+ _is_prepared = false;
+
+ // Create convolver
+ _dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info);
+ ARM_COMPUTE_ERROR_ON(_dwc_assembly_kernel == nullptr);
+
+ // Create assembly kernel wrapper
+ _dwc_acl_kernel.configure(_dwc_assembly_kernel.get());
+
+ constexpr size_t alignment = 128;
+
+ // Create workspace
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+ const size_t workspace_size = _dwc_assembly_kernel->get_working_space_size(num_threads);
+ ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
+ _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment);
+ _memory_group.manage(&_workspace);
+ _workspace.allocator()->allocate();
+
+ // Create packing tensor
+ const size_t pack_tensor_size = _dwc_assembly_kernel->get_packed_params_size();
+ ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
+ _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment);
+}
+
+Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ const auto strides = conv_info.stride();
+ const DataLayout data_layout = input->data_layout();
+ unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(!((strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2))));
+ ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1);
+
+ const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
+ const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !(is_relu || is_relu6));
+
+ // Check bias
+ if(bias != nullptr)
+ {
+ unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
+ }
+
+ // Check output
+ if(output->total_size() != 0)
+ {
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ PadStrideInfo conv_info,
+ unsigned int depth_multiplier,
+ const Size2D &dilation)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+
+ // Reshape input shape if in NHWC format
+ const DataLayout data_layout = input->data_layout();
+ TensorShape in_shape{ input->tensor_shape() };
+ if(data_layout == DataLayout::NHWC)
+ {
+ in_shape.set(Window::DimX, input->tensor_shape().y());
+ in_shape.set(Window::DimY, input->tensor_shape().z());
+ in_shape.set(Window::DimZ, input->tensor_shape().x());
+ }
+
+ // Check data type
+ const DataType data_type = weights->data_type();
+ bool is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type);
+
+ // Check weighs size
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ bool weights_supported = (weights->dimension(width_idx) == 3) && (weights->dimension(height_idx) == 3);
+
+ // Check for supported strides
+ const auto &strides = conv_info.stride();
+ bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
+
+ // Check for supported padding
+ const auto pad_top = conv_info.pad_top();
+ const auto pad_right = conv_info.pad_right();
+ const auto pad_bottom = conv_info.pad_bottom();
+ const auto pad_left = conv_info.pad_left();
+ PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
+ bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
+ bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
+ bool supported_padding = is_same_padding || is_valid_padding;
+ bool is_dilation_1 = dilation.x() == 1 && dilation.y() == 1;
+
+ return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_1;
+}
+
+void NEDepthwiseConvolutionAssemblyDispatch::run()
+{
+ // Prepare assembly kernel
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Setup inputs/outputs
+ ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr);
+ _dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
+
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+ const int input_element_size = _input->info()->element_size();
+ const int input_batch_stride = _input->info()->strides_in_bytes()[3] / input_element_size;
+ const int input_row_stride = _input->info()->strides_in_bytes().z() / input_element_size;
+ const int input_col_stride = _input->info()->strides_in_bytes().y() / input_element_size;
+ const void *input_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes();
+ _dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
+
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+ const int output_element_size = _output->info()->element_size();
+ const int output_batch_stride = _output->info()->strides_in_bytes()[3] / output_element_size;
+ const int output_row_stride = _output->info()->strides_in_bytes().z() / output_element_size;
+ const int output_col_stride = _output->info()->strides_in_bytes().y() / output_element_size;
+ void *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes();
+ _dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
+
+ // Schedule assembly kernel
+ NEScheduler::get().schedule(&_dwc_acl_kernel, Window::DimX);
+}
+
+void NEDepthwiseConvolutionAssemblyDispatch::prepare()
+{
+ if(!_is_prepared)
+ {
+ _packed_weights.allocator()->allocate();
+ ARM_COMPUTE_ERROR_ON(_packed_weights.buffer() == nullptr);
+
+ // Pack weights and bias
+ const int weights_element_size = _weights->info()->element_size();
+ const int weights_row_stride = _weights->info()->strides_in_bytes().z() / weights_element_size;
+ const int weights_col_stride = _weights->info()->strides_in_bytes().y() / weights_element_size;
+ _dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
+ _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
+ weights_row_stride,
+ weights_col_stride,
+ (_bias != nullptr) ? _bias->buffer() : nullptr);
+ _dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
+
+ _weights->mark_as_unused();
+ if(_bias != nullptr)
+ {
+ _bias->mark_as_unused();
+ }
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index 34aaea0..e207ab0 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -183,9 +183,8 @@
{
prepare();
- _memory_group.acquire();
+ MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().run_tagged_workloads(_workloads, _tag.c_str());
- _memory_group.release();
}
void NEGEMMInterleavedWrapper::prepare()