arm_compute v19.11 — selected NEON runtime function changes (unified diff excerpt)
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index 6af71a3..1b86514 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,15 @@
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
+NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) // NOLINT
+ : INESimpleFunctionNoBorder(ctx)
+{
+}
void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
{
auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernel>();
@@ -39,3 +44,4 @@
{
return NEActivationLayerKernel::validate(input, output, act_info);
}
+} // namespace arm_compute
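Editorial sketch (not part of the patch above): NEActivationLayer now forwards an optional IRuntimeContext to INESimpleFunctionNoBorder. The snippet below assumes the new ctx argument may be left as nullptr, which is not shown in this diff, and that existing call sites keep working unchanged:

    // Minimal usage sketch, assumptions noted inline.
    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"

    using namespace arm_compute;

    void run_relu(ITensor *src, ITensor *dst)
    {
        NEActivationLayer act(nullptr); // passing no runtime context is assumed to keep the old behaviour
        act.configure(src, dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
        act.run();
    }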
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index 6863bb0..a23061e 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -29,41 +29,28 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
namespace arm_compute
{
NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernel(), _fill_border_kernel(), _run_fill_border(false)
+ : _reduction_function(support::cpp14::make_unique<NEReductionOperation>())
{
+ ARM_COMPUTE_UNUSED(memory_manager);
}
void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op)
{
- _reduction_kernel.configure(input, output, axis, op);
-
- if(axis == 0)
- {
- _fill_border_kernel.configure(input, _reduction_kernel.border_size(), BorderMode::REPLICATE);
- _run_fill_border = true;
- }
+ _reduction_function->configure(input, output, axis, op, false);
}
Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
- ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
- return Status{};
+ return NEReductionOperation::validate(input, output, axis, op, false);
}
void NEArgMinMaxLayer::run()
{
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- if(_run_fill_border)
- {
- NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
- }
- NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
+ _reduction_function->run();
}
} // namespace arm_compute
\ No newline at end of file
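A usage sketch for the rewritten NEArgMinMaxLayer (illustrative only; tensor setup is omitted and the default-constructed memory manager is an assumption, since the argument is now unused internally):

    // Sketch only: the public interface is unchanged; the function now delegates
    // to NEReductionOperation instead of scheduling its own kernels.
    NEArgMinMaxLayer argminmax;   // memory manager argument is ignored internally after this change
    argminmax.configure(&input, /*axis=*/0, &indices, ReductionOperation::ARG_IDX_MAX);
    argminmax.run();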
diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
new file mode 100644
index 0000000..818e228
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+{
+ // Configure Bounding Box kernel
+ auto k = arm_compute::support::cpp14::make_unique<NEBoundingBoxTransformKernel>();
+ k->configure(boxes, pred_boxes, deltas, info);
+ _kernel = std::move(k);
+}
+
+Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+{
+ return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
+}
+} // namespace arm_compute
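The new NEBoundingBoxTransform is a thin simple-function wrapper over NEBoundingBoxTransformKernel. A hypothetical call sequence, with placeholder tensors and BoundingBoxTransformInfo values that are not taken from this patch, could look like:

    // Illustrative only: boxes, deltas and pred_boxes are assumed to be
    // pre-allocated tensors with the shapes the kernel expects.
    NEBoundingBoxTransform bbox;
    bbox.configure(&boxes, &pred_boxes, &deltas,
                   BoundingBoxTransformInfo(/*img_width=*/600.f, /*img_height=*/800.f, /*scale=*/1.f));
    bbox.run();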
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 032e617..3d5fbfb 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -37,6 +37,7 @@
#include "support/ToolchainSupport.h"
#include <cstring>
+#include <inttypes.h>
#include <utility>
using namespace arm_compute;
@@ -118,7 +119,7 @@
}
else
{
- ARM_COMPUTE_ERROR("Gradient size %d not supported\n", gradient_size);
+ ARM_COMPUTE_ERROR_VAR("Gradient size %+" PRId32 " not supported\n", gradient_size);
}
// Manage intermediate buffers
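The ARM_COMPUTE_ERROR_VAR change formats the gradient size with the standard PRId32 macro, which is why <inttypes.h> is now included. A self-contained illustration of the same formatting idiom, outside the library:

    #include <cinttypes>
    #include <cstdio>

    int main()
    {
        const int32_t gradient_size = 7;
        // "%+" PRId32 expands to a signed 32-bit conversion; this prints "+7".
        std::printf("Gradient size %+" PRId32 " not supported\n", gradient_size);
        return 0;
    }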
diff --git a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp b/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
new file mode 100644
index 0000000..7702fb0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEComputeAllAnchors::configure(const ITensor *anchors, ITensor *all_anchors, const ComputeAnchorsInfo &info)
+{
+ // Configure ComputeAllAnchors kernel
+ auto k = arm_compute::support::cpp14::make_unique<NEComputeAllAnchorsKernel>();
+ k->configure(anchors, all_anchors, info);
+ _kernel = std::move(k);
+}
+
+Status NEComputeAllAnchors::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+{
+ return NEComputeAllAnchorsKernel::validate(anchors, all_anchors, info);
+}
+} // namespace arm_compute
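Similarly, NEComputeAllAnchors wraps NEComputeAllAnchorsKernel. A hypothetical configuration is sketched below; the ComputeAnchorsInfo constructor arguments are placeholders, not values from this patch:

    // Illustrative only: anchors holds the base anchors; all_anchors receives
    // one anchor per spatial location of a feature map of the given size.
    NEComputeAllAnchors all_anchors_f;
    all_anchors_f.configure(&anchors, &all_anchors,
                            ComputeAnchorsInfo(/*feat_width=*/38.f, /*feat_height=*/38.f, /*spatial_scale=*/0.0625f));
    all_anchors_f.run();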
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index b5b159a..f65c035 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
: _kernel()
{
@@ -46,3 +46,4 @@
{
NEScheduler::get().schedule(&_kernel, Window::DimZ);
}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 1f2cc3d..0411b41 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -64,13 +64,8 @@
const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
- ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
-
- auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx),
- info.pad().first, info.pad().second, stride_x, stride_y);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
if(bias != nullptr)
@@ -96,9 +91,11 @@
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
}
- unsigned int padx = 0;
- unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, padx, pady);
+ unsigned int deconv_pad_x = 0;
+ unsigned int deconv_pad_y = 0;
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
@@ -126,14 +123,17 @@
_is_prepared = false;
_is_nchw = data_layout == DataLayout::NCHW;
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
+ const unsigned int pad_left = info.pad_left();
+ const unsigned int pad_right = info.pad_right();
+ const unsigned int pad_top = info.pad_top();
+ const unsigned int pad_bottom = info.pad_bottom();
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx), weights->info()->dimension(width_idx),
- weights->info()->dimension(height_idx),
- info.pad().first, info.pad().second, stride_x, stride_y);
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+ weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
// Output auto initialization if not yet initialized
@@ -157,16 +157,30 @@
_permuted_weights.info()->set_data_layout(DataLayout::NCHW);
// Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
- unsigned int padx = 0;
- unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*_permuted_input.info(), *_permuted_weights.info(), stride_x, stride_y, out_dims, padx,
- pady);
+ unsigned int deconv_pad_x = 0;
+ unsigned int deconv_pad_y = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*_permuted_input.info(), *_permuted_weights.info(), stride_x, stride_y, out_dims,
+ deconv_pad_x, deconv_pad_y);
+
+ unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0;
+ unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
+ deconv_pad_x -= deconv_pad_left + deconv_pad_right;
+ ARM_COMPUTE_ERROR_ON((deconv_pad_x % 2) != 0);
+ deconv_pad_left += deconv_pad_x / 2;
+ deconv_pad_right += deconv_pad_x / 2;
+
+ unsigned int deconv_pad_top = pad_bottom > pad_top ? pad_bottom - pad_top : 0;
+ unsigned int deconv_pad_bottom = pad_top > pad_bottom ? pad_top - pad_bottom : 0;
+ deconv_pad_y -= deconv_pad_top + deconv_pad_bottom;
+ ARM_COMPUTE_ERROR_ON((deconv_pad_y % 2) != 0);
+ deconv_pad_top += deconv_pad_y / 2;
+ deconv_pad_bottom += deconv_pad_y / 2;
TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(), _permuted_input.info()->quantization_info());
scale_out_info.set_data_layout(DataLayout::NCHW);
_scaled_output.allocator()->init(scale_out_info);
- const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+ const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
_upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info);
_weights_flipped.allocator()->init(*_permuted_weights.info()->clone());
@@ -189,14 +203,30 @@
else
{
// Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
- unsigned int padx = 0;
- unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, padx, pady);
+ unsigned int deconv_pad_x = 0;
+ unsigned int deconv_pad_y = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y,
+ out_dims, deconv_pad_x, deconv_pad_y);
+
+ unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0;
+ unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
+ deconv_pad_x -= deconv_pad_left + deconv_pad_right;
+ ARM_COMPUTE_ERROR_ON((deconv_pad_x % 2) != 0);
+ deconv_pad_left += deconv_pad_x / 2;
+ deconv_pad_right += deconv_pad_x / 2;
+
+ unsigned int deconv_pad_top = pad_bottom > pad_top ? pad_bottom - pad_top : 0;
+ unsigned int deconv_pad_bottom = pad_top > pad_bottom ? pad_top - pad_bottom : 0;
+ deconv_pad_y -= deconv_pad_top + deconv_pad_bottom;
+ ARM_COMPUTE_ERROR_ON((deconv_pad_y % 2) != 0);
+ deconv_pad_top += deconv_pad_y / 2;
+ deconv_pad_bottom += deconv_pad_y / 2;
TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
scale_out_info.set_data_layout(data_layout);
_scaled_output.allocator()->init(scale_out_info);
- const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+
+ const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
_upsample_f.configure(input, &_scaled_output, upsample_info);
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
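With the padding_is_symmetric() check removed and deconvolution_output_dimensions() now taking the full PadStrideInfo, asymmetric padding can be expressed directly at configure time. A minimal sketch, with tensor setup assumed and values chosen only for illustration:

    // Sketch only: left/right and top/bottom padding no longer need to match.
    const PadStrideInfo deconv_info(/*stride_x=*/2, /*stride_y=*/2,
                                    /*pad_left=*/1, /*pad_right=*/0,
                                    /*pad_top=*/1, /*pad_bottom=*/0,
                                    DimensionRoundingType::CEIL);
    NEDeconvolutionLayer deconv;
    deconv.configure(&input, &weights, &bias, &output, deconv_info);
    deconv.run();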
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index fbdee84..6cf7b97 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -23,548 +23,20 @@
*/
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-#include "arm_compute/core/utils/misc/InfoHelpers.h"
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
namespace arm_compute
{
-NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
- _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
- _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+namespace
{
-}
-
-void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor *input,
- const ITensor *weights,
- const ITensor *biases,
- ITensor *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
-{
- ARM_COMPUTE_UNUSED(act_info);
-
- PixelValue zero_value(0.f);
-
- // Initialize the intermediate accumulator tensor in case of quantized input
- if(_is_quantized)
- {
- TensorShape accum_shape = output->info()->tensor_shape();
- DataLayout accum_layout = output->info()->data_layout();
- if(!_is_nchw)
- {
- permute(accum_shape, PermutationVector(1U, 2U, 0U));
- accum_layout = DataLayout::NCHW;
- }
-
- _memory_group.manage(&_accumulator);
- _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
- _accumulator.info()->set_data_layout(accum_layout);
- zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
- }
-
- if(!_is_nchw)
- {
- _memory_group.manage(&_permuted_input);
- _memory_group.manage(&_permuted_output);
-
- // Configure the function to transform the input tensor from NHWC -> NCHW
- _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
- _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
- // Configure the function to transform the weights tensor from HWI -> IHW
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
- _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
- _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
- // Configure depthwise
- _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
-
- // Configure border handler
- _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-
- // Allocate tensors
- _permuted_input.allocator()->allocate();
- }
- else
- {
- // Configure depthwise convolution kernel
- _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
-
- // Configure border handler
- _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
- }
-
- // Configure biases accumulation
- if(_is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();
-
- float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
- int output_multiplier;
- int output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
- _accumulator.allocator()->allocate();
- }
- else if(_has_bias)
- {
- _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
- }
-
- // Permute output
- if(!_is_nchw)
- {
- // Configure the function to transform the convoluted output to NHWC
- _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
- _permuted_output.allocator()->allocate();
- }
-}
-
-void NEDepthwiseConvolutionLayer3x3::configure_optimized(const ITensor *input,
- const ITensor *weights,
- const ITensor *biases,
- ITensor *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info)
-{
- ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
- const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
- const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
- _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
- if(!_is_activationlayer_enabled)
- {
- act_info_to_use = act_info;
- }
-
- if(_is_nchw)
- {
- _memory_group.manage(&_permuted_input);
- _memory_group.manage(&_permuted_output);
-
- // Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
- _permuted_input.info()->set_data_layout(DataLayout::NHWC);
-
- // Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
- _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
-
- _permuted_output.info()->set_data_layout(DataLayout::NHWC);
- _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
- // Configure optimized depthwise
- _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use);
-
- // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permuted_output.info()->set_data_layout(DataLayout::NHWC);
- _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
-
- // Allocate tensors
- _permuted_input.allocator()->allocate();
- _permuted_output.allocator()->allocate();
- }
- else
- {
- _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use);
- }
-}
-
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input,
- const ITensor *weights,
- const ITensor *biases,
- ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer3x3::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
- output->info(), conv_info, depth_multiplier, act_info, dilation));
-
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _has_bias = biases != nullptr;
- _is_optimized = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
- weights->info(),
- conv_info,
- depth_multiplier, dilation);
- _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
- _permute = _is_optimized == _is_nchw;
- _is_prepared = false;
- _is_activationlayer_enabled = act_info.enabled();
-
- // Configure appropriate pipeline
- if(_is_optimized)
- {
- configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info);
- }
- else
- {
- configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- }
-
- // Configure activation
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
-}
-
-Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
-
- if(biases != nullptr)
- {
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
- }
-
- if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
- {
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
-
- if(is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-
- float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
- int output_multiplier;
- int output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output, output_multiplier, output_shift, oq_info.offset));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier));
- }
-
- //Validate Activation Layer
- if(act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
- }
-
- return Status{};
-}
-
-void NEDepthwiseConvolutionLayer3x3::run_generic()
-{
- // Fill border
- NEScheduler::get().schedule(&_border_handler, Window::DimX);
-
- // Execute depthwise convolution
- NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
-
- // Add biases
- if(_has_bias || _is_quantized)
- {
- NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
- }
-
- // Permute output
- if(!_is_nchw)
- {
- _permute_output.run();
- }
-}
-
-void NEDepthwiseConvolutionLayer3x3::run_optimized()
-{
- // Run assembly function
- _dwc_optimized_func.run();
-
- // Permute output
- if(_is_nchw)
- {
- _permute_output.run();
- }
-}
-
-void NEDepthwiseConvolutionLayer3x3::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Permute input
- if(_permute)
- {
- _permute_input.run();
- }
-
- _is_optimized ? run_optimized() : run_generic();
-
- // Run activation
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
-}
-
-void NEDepthwiseConvolutionLayer3x3::prepare()
-{
- if(!_is_prepared)
- {
- // Permute weights
- if(_permute)
- {
- _permuted_weights.allocator()->allocate();
- _permute_weights.run();
- _original_weights->mark_as_unused();
- }
-
- // Prepare optimized function
- if(_is_optimized)
- {
- _dwc_optimized_func.prepare();
- if(!_permuted_weights.is_used())
- {
- _permuted_weights.allocator()->free();
- }
- }
-
- _is_prepared = true;
- }
-}
-
-NEDepthwiseConvolutionLayerOptimized::NEDepthwiseConvolutionLayerOptimized(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
- _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
- _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
-{
-}
-
-void NEDepthwiseConvolutionLayerOptimized::configure_generic(ITensor *input,
- const ITensor *weights,
- const ITensor *biases,
- ITensor *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
-{
- ARM_COMPUTE_UNUSED(act_info);
-
- PixelValue zero_value(0.f);
-
- // Initialize the intermediate accumulator tensor in case of quantized input
- if(_is_quantized)
- {
- TensorShape accum_shape = output->info()->tensor_shape();
- DataLayout accum_layout = output->info()->data_layout();
- if(!_is_nchw)
- {
- permute(accum_shape, PermutationVector(1U, 2U, 0U));
- accum_layout = DataLayout::NCHW;
- }
-
- _memory_group.manage(&_accumulator);
- _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
- _accumulator.info()->set_data_layout(accum_layout);
- zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
- }
-
- if(!_is_nchw)
- {
- _memory_group.manage(&_permuted_input);
- _memory_group.manage(&_permuted_output);
-
- // Configure the function to transform the input tensor from NHWC -> NCHW
- _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
- _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
- // Configure the function to transform the weights tensor from HWI -> IHW
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
- _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
- _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
- // Configure depthwise
- _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
-
- // Configure border handler
- _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-
- // Allocate tensors
- _permuted_input.allocator()->allocate();
- }
- else
- {
- // Configure depthwise convolution kernel
- _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
-
- // Configure border handler
- _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
- }
-
- // Configure biases accumulation
- if(_is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();
-
- float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
- int output_multiplier;
- int output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
- _accumulator.allocator()->allocate();
- }
- else if(_has_bias)
- {
- _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
- }
-
- // Permute output
- if(!_is_nchw)
- {
- // Configure the function to transform the convoluted output to NHWC
- _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
- _permuted_output.allocator()->allocate();
- }
-}
-
-void NEDepthwiseConvolutionLayerOptimized::configure_optimized(const ITensor *input,
- const ITensor *weights,
- const ITensor *biases,
- ITensor *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
-{
- ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
- const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
- const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
- _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
- if(!_is_activationlayer_enabled)
- {
- act_info_to_use = act_info;
- }
-
- if(_is_nchw)
- {
- _memory_group.manage(&_permuted_input);
- _memory_group.manage(&_permuted_output);
-
- // Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
- _permuted_input.info()->set_data_layout(DataLayout::NHWC);
-
- // Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
- _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
-
- _permuted_output.info()->set_data_layout(DataLayout::NHWC);
- _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
- // Configure optimized depthwise
- _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation);
-
- // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permuted_output.info()->set_data_layout(DataLayout::NHWC);
- _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
-
- // Allocate tensors
- _permuted_input.allocator()->allocate();
- _permuted_output.allocator()->allocate();
- }
- else
- {
- _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
- }
-}
-
-void NEDepthwiseConvolutionLayerOptimized::configure(ITensor *input,
- const ITensor *weights,
- const ITensor *biases,
- ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimized::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
- output->info(), conv_info, depth_multiplier, act_info, dilation));
-
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _has_bias = biases != nullptr;
- _is_optimized = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
- weights->info(),
- conv_info,
- depth_multiplier,
- dilation);
- _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
- _permute = _is_optimized == _is_nchw;
- _is_prepared = false;
- _is_activationlayer_enabled = act_info.enabled();
-
- // Configure appropriate pipeline
- if(_is_optimized)
- {
- configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- }
- else
- {
- configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- }
-
- // Configure activation
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
-}
-
-Status NEDepthwiseConvolutionLayerOptimized::validate(const ITensorInfo *input,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
+Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -619,8 +91,244 @@
return Status{};
}
+} // namespace
-void NEDepthwiseConvolutionLayerOptimized::run_generic()
+NEDepthwiseConvolutionLayerOptimized::NEDepthwiseConvolutionLayerOptimized(std::shared_ptr<IMemoryManager> memory_manager)
+ : _func(std::move(memory_manager))
+{
+}
+
+void NEDepthwiseConvolutionLayerOptimized::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ _func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
+
+Status NEDepthwiseConvolutionLayerOptimized::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
+ return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
+
+void NEDepthwiseConvolutionLayerOptimized::run()
+{
+ _func.run();
+}
+
+void NEDepthwiseConvolutionLayerOptimized::prepare()
+{
+ _func.prepare();
+}
+
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
+ _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
+ _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
+{
+}
+
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_generic(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+
+ PixelValue zero_value(0.f);
+
+ // Initialize the intermediate accumulator tensor in case of quantized input
+ if(_is_quantized)
+ {
+ TensorShape accum_shape = output->info()->tensor_shape();
+ DataLayout accum_layout = output->info()->data_layout();
+ if(!_is_nchw)
+ {
+ permute(accum_shape, PermutationVector(1U, 2U, 0U));
+ accum_layout = DataLayout::NCHW;
+ }
+
+ _memory_group.manage(&_accumulator);
+ _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
+ _accumulator.info()->set_data_layout(accum_layout);
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
+ }
+
+ if(!_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+ _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+
+ // Configure depthwise
+ _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);
+
+ // Configure border handler
+ _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ }
+ else
+ {
+ // Configure depthwise convolution kernel
+ _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);
+
+ // Configure border handler
+ _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+ }
+
+ // Configure biases accumulation
+ if(_is_quantized)
+ {
+ const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();
+
+ float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
+ int output_multiplier;
+ int output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
+ _accumulator.allocator()->allocate();
+ }
+ else if(_has_bias)
+ {
+ _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
+ }
+
+ // Permute output
+ if(!_is_nchw)
+ {
+ // Configure the function to transform the convoluted output to NHWC
+ _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+ _permuted_output.allocator()->allocate();
+ }
+}
+
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_optimized(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+ const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
+ const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
+ _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
+ if(!_is_activationlayer_enabled)
+ {
+ act_info_to_use = act_info;
+ }
+
+ if(_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+ _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+ _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+
+ // Configure optimized depthwise
+ _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation);
+
+ // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+ _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+ else
+ {
+ _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
+ }
+}
+
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
+ output->info(), conv_info, depth_multiplier, act_info, dilation));
+
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _has_bias = biases != nullptr;
+ _is_optimized = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
+ weights->info(),
+ conv_info,
+ depth_multiplier,
+ dilation);
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _permute = _is_optimized == _is_nchw;
+ _is_prepared = false;
+ _is_activationlayer_enabled = act_info.enabled();
+
+ // Configure appropriate pipeline
+ if(_is_optimized)
+ {
+ configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ }
+ else
+ {
+ configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ }
+
+ // Configure activation
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
+}
+
+Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+{
+ return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
+
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_generic()
{
// Fill border
NEScheduler::get().schedule(&_border_handler, Window::DimX);
@@ -641,7 +349,7 @@
}
}
-void NEDepthwiseConvolutionLayerOptimized::run_optimized()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_optimized()
{
// Run assembly function
_dwc_optimized_func.run();
@@ -653,7 +361,7 @@
}
}
-void NEDepthwiseConvolutionLayerOptimized::run()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
{
prepare();
@@ -674,7 +382,7 @@
}
}
-void NEDepthwiseConvolutionLayerOptimized::prepare()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
{
if(!_is_prepared)
{
@@ -700,262 +408,84 @@
}
}
-NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
- : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _depthwise_conv_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _fill_border(), _v2mm_input_fill_border(),
- _v2mm_weights_fill_border(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(),
- _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false), _is_quantized(false), _is_nhwc(false), _is_activationlayer_enabled(false), _is_optimized(false),
- _original_weights(nullptr)
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
+ : _depthwise_conv_kernel(), _fill_border(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(),
+ _is_prepared(false), _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
{
}
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- // Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
output->info(), conv_info, depth_multiplier, act_info, dilation));
- _is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
- _is_optimized = _is_nhwc && input->info()->data_type() == DataType::F32;
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _is_prepared = !_is_nchw;
- if(!_is_optimized)
+ ITensor *input_to_use = input;
+ const ITensor *weights_to_use = weights;
+ ITensor *output_to_use = output;
+ if(_is_nchw)
{
- ITensor *input_to_use = input;
- const ITensor *weights_to_use = weights;
- ITensor *output_to_use = output;
+ _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+ _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+ input_to_use = &_permuted_input;
- if(_is_nhwc)
- {
- _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
- _permuted_input.info()->set_data_layout(DataLayout::NCHW);
- input_to_use = &_permuted_input;
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+ weights_to_use = &_permuted_weights;
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
- _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
- weights_to_use = &_permuted_weights;
- }
-
- const size_t weights_w = weights_to_use->info()->dimension(0);
- const size_t weights_h = weights_to_use->info()->dimension(1);
- const size_t weights_z = weights_to_use->info()->dimension(2);
-
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _is_prepared = false;
- _original_weights = weights_to_use;
-
- // Should bias be appended ?
- bool append_bias = (biases != nullptr) && !_is_quantized;
-
- // Calculate output shape
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
- if(_is_nhwc)
- {
- permute(output_shape, PermutationVector(1U, 2U, 0U));
- _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- _permuted_output.info()->set_data_layout(DataLayout::NCHW);
- _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
- output_to_use = &_permuted_output;
- }
-
- // Output width and height
- const unsigned int conv_w = output_shape.x();
- const unsigned int conv_h = output_shape.y();
-
- // Set up intermediate tensors
- const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
- const size_t conv_size = conv_w * conv_h;
-
- // Im2Col configuration
- TensorShape shape_im2col = input_to_use->info()->tensor_shape();
- shape_im2col.set(0, patch_size);
- shape_im2col.set(1, conv_size);
- shape_im2col.set(2, weights_z);
- _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
- _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
-
- // Weights reshape configuration
- const TensorShape shape_weights_reshape(patch_size, weights_z);
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
- _weights_reshape_kernel.configure(weights_to_use, &_weights_reshaped, append_bias ? biases : nullptr);
-
- // GEMV configuration
- DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
- TensorShape shape_v2mm_out = input_to_use->info()->tensor_shape();
- shape_v2mm_out.set(0, conv_size * weights_z);
- shape_v2mm_out.set(1, 1);
- shape_v2mm_out.set(2, 1);
- _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
- _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
- _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output_to_use, conv_w, conv_h);
-
- // Output staged configuration
- if(_is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
- int output_multiplier;
- int output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_output_reshaped, biases, output_to_use, output_multiplier, output_shift, oq_info.offset);
- _output_reshaped.allocator()->allocate();
- }
-
- if(_is_nhwc)
- {
- _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
-
- _permuted_input.allocator()->allocate();
- _permuted_weights.allocator()->allocate();
- _permuted_output.allocator()->allocate();
- }
-
- // Fill borders on inputs
- PixelValue zero_in(static_cast<int32_t>(0));
- PixelValue zero_w(static_cast<int32_t>(0));
- if(_is_quantized)
- {
- zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().uniform().offset));
- zero_w = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().uniform().offset));
- }
- BorderSize border_size = _v2mm_kernel.border_size();
- _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
-
- border_size.bottom = 0;
- _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
-
- // Allocate intermediate tensors
- _input_reshaped.allocator()->allocate();
- _v2mm_output.allocator()->allocate();
+ _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+ output_to_use = &_permuted_output;
}
- else
- {
- // Configure kernel
- _depthwise_conv_kernel.configure(input, weights, biases, output, conv_info, depth_multiplier, dilation);
+ _original_weights = weights_to_use;
- // Fill input borders
- _fill_border.configure(input, _depthwise_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint64_t>(0), input->info()->data_type()));
+ _depthwise_conv_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation);
+ _fill_border.configure(input_to_use, _depthwise_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()));
+
+ if(_is_nchw)
+ {
+ _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+ _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+
+ _permuted_input.allocator()->allocate();
+ _permuted_weights.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
}
//Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
-
if(_is_activationlayer_enabled)
{
_activationlayer_function.configure(output, nullptr, act_info);
}
}
-Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
-
- const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) + (weights->dimension(width_idx) - 1) * (dilation.x() - 1) > input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) + (weights->dimension(height_idx) - 1) * (dilation.y() - 1) > input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom());
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) * depth_multiplier) != weights->dimension(channel_idx));
-
- if(input->data_layout() != DataLayout::NHWC || input->data_type() != DataType::F32)
+ if(input->data_layout() == DataLayout::NCHW)
{
- // Clone output to use auto init
- auto output_clone = output->clone();
-
- const ITensorInfo *input_to_use = input;
- const ITensorInfo *weights_to_use = weights;
- const ITensorInfo *output_to_use = output_clone.get();
-
TensorShape permuted_input_shape = input->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorInfo permuted_input;
- TensorInfo permuted_weights;
+ TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+ permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
+ permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
+ permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
- if(input->data_layout() == DataLayout::NHWC)
- {
- permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
- permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+ const TensorInfo permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
- permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW));
- permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
- input_to_use = &permuted_input;
- weights_to_use = &permuted_weights;
- }
-
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const bool append_bias = (biases != nullptr) && !is_quantized;
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
- const size_t weights_w = weights_to_use->dimension(0);
- const size_t weights_h = weights_to_use->dimension(1);
- const size_t weights_z = weights_to_use->dimension(2);
- const unsigned int conv_w = output_shape[width_idx];
- const unsigned int conv_h = output_shape[height_idx];
- const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
- const size_t conv_size = conv_w * conv_h;
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output_clone, input->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-
- TensorInfo permuted_output;
- if(input->data_layout() == DataLayout::NHWC)
- {
- permute(output_shape, PermutationVector(1U, 2U, 0U));
- permuted_output = TensorInfo(output_clone->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_layout(DataLayout::NCHW));
- output_to_use = &permuted_output;
- }
-
- // Im2Col configuration
- TensorShape shape_im2col = input_to_use->tensor_shape();
- shape_im2col.set(0, patch_size);
- shape_im2col.set(1, conv_size);
- shape_im2col.set(2, weights_z);
- TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
-
- // Weights reshape configuration
- const TensorShape shape_weights_reshape(patch_size, weights_z);
- TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseWeightsReshapeKernel::validate(weights_to_use, &weights_reshaped, append_bias ? biases : nullptr));
-
- // GEMV configuration
- DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
- TensorShape shape_v2mm_out = input_to_use->tensor_shape();
- shape_v2mm_out.set(0, conv_size * weights_z);
- shape_v2mm_out.set(1, 1);
- shape_v2mm_out.set(2, 1);
- TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
-
- TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_to_use->tensor_shape()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output_to_use, conv_w, conv_h));
-
- if(is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-
- float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
- int output_multiplier;
- int output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output_to_use, output_multiplier, output_shift, oq_info.offset));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, dilation));
}
else
{
@@ -971,35 +501,20 @@
return Status{};
}
-void NEDepthwiseConvolutionLayer::run()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
{
- if(!_is_optimized)
+ if(_is_nchw)
{
prepare();
-
- if(_is_nhwc)
- {
- _permute_input.run();
- }
-
- NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
- NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
- NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
- NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
- if(_is_quantized)
- {
- NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
- }
-
- if(_is_nhwc)
- {
- _permute_output.run();
- }
+ _permute_input.run();
}
- else
+
+ NEScheduler::get().schedule(&_fill_border, Window::DimX);
+ NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY);
+
+ if(_is_nchw)
{
- NEScheduler::get().schedule(&_fill_border, Window::DimX);
- NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY);
+ _permute_output.run();
}
if(_is_activationlayer_enabled)
@@ -1008,24 +523,98 @@
}
}
-void NEDepthwiseConvolutionLayer::prepare()
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::prepare()
{
- if(!_is_prepared && !_is_optimized)
+ if(!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- if(_is_nhwc)
- {
- _permute_weights.run();
- }
-
- // Run reshape and mark original weights as unused
- _weights_reshaped.allocator()->allocate();
- NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
- NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+ _permute_weights.run();
_original_weights->mark_as_unused();
-
_is_prepared = true;
}
}
+
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(std::move(memory_manager)), _func_generic()
+{
+}
+
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
+ _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation);
+ switch(_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+ }
+}
+
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
+ DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ switch(depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ return NEDepthwiseConvolutionLayerOptimized::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ return NEDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+ }
+}
+
+DepthwiseConvolutionFunction NEDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+{
+ if(bool(NEDepthwiseConvolutionLayerOptimized::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)))
+ {
+ return DepthwiseConvolutionFunction::OPTIMIZED;
+ }
+ else
+ {
+ return DepthwiseConvolutionFunction::GENERIC;
+ }
+}
+
+void NEDepthwiseConvolutionLayer::run()
+{
+ switch(_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.run();
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.run();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+ }
+}
+
+void NEDepthwiseConvolutionLayer::prepare()
+{
+ switch(_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.prepare();
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.prepare();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+ }
+}
} // namespace arm_compute
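
The hunks above split the layer into an OPTIMIZED and a GENERIC internal function and pick between them at configure() time via get_depthwiseconvolution_function(). A minimal caller-side sketch of the public interface, assuming NHWC F32 tensors with hypothetical shapes (3x3 kernel, 16 channels, no padding); filling the tensors with data is omitted:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 32U, 32U), 1, DataType::F32));   // C=16, W=32, H=32
    weights.allocator()->init(TensorInfo(TensorShape(16U, 3U, 3U), 1, DataType::F32)); // C=16, Kw=3, Kh=3
    bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 30U, 30U), 1, DataType::F32));
    src.info()->set_data_layout(DataLayout::NHWC);
    weights.info()->set_data_layout(DataLayout::NHWC);
    dst.info()->set_data_layout(DataLayout::NHWC);

    NEDepthwiseConvolutionLayer dwc(nullptr); // no memory manager in this sketch
    dwc.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0), 1 /* depth_multiplier */,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), Size2D(1U, 1U));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src/weights/bias ...

    dwc.prepare(); // one-off weight permute/packing for whichever path was selected
    dwc.run();     // dispatches to the OPTIMIZED or GENERIC implementation chosen at configure() time
    return 0;
}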
diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
deleted file mode 100644
index da2e49c..0000000
--- a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-NEDepthwiseSeparableConvolutionLayer::NEDepthwiseSeparableConvolutionLayer()
- : _depthwise_conv(), _pointwise_conv()
-{
-}
-
-void NEDepthwiseSeparableConvolutionLayer::configure(ITensor *input, const ITensor *depthwise_weights, const ITensor *depthwise_biases, ITensor *depthwise_out,
- const ITensor *pointwise_weights, const ITensor *pointwise_biases, ITensor *output,
- const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
-{
- _depthwise_conv.configure(input, depthwise_weights, depthwise_biases, depthwise_out, depthwise_conv_info);
- _pointwise_conv.configure(depthwise_out, pointwise_weights, pointwise_biases, output, pointwise_conv_info);
-}
-
-void NEDepthwiseSeparableConvolutionLayer::run()
-{
- prepare();
-
- _depthwise_conv.run();
- _pointwise_conv.run();
-}
-
-void NEDepthwiseSeparableConvolutionLayer::prepare()
-{
- _depthwise_conv.prepare();
- _pointwise_conv.prepare();
-}
\ No newline at end of file
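
NEDepthwiseSeparableConvolutionLayer is removed outright rather than ported to the new structure; it was only a thin wrapper that chained a depthwise pass into a 1x1 pointwise convolution. Callers that relied on it can compose the remaining functions themselves. A hedged sketch of that composition follows; the helper names are hypothetical, the tensors are assumed to be initialised and allocated by the caller, and NEConvolutionLayer stands in for the pointwise step since the deleted wrapper's exact member types lived in its header:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"

using namespace arm_compute;

void configure_separable(NEDepthwiseConvolutionLayer &depthwise, NEConvolutionLayer &pointwise,
                         ITensor *input, const ITensor *dw_weights, const ITensor *dw_bias, ITensor *dw_out,
                         const ITensor *pw_weights, const ITensor *pw_bias, ITensor *output,
                         const PadStrideInfo &dw_conv_info, const PadStrideInfo &pw_conv_info)
{
    // The deleted wrapper chained the same two configure calls back to back.
    depthwise.configure(input, dw_weights, dw_bias, dw_out, dw_conv_info, 1 /* depth_multiplier */);
    pointwise.configure(dw_out, pw_weights, pw_bias, output, pw_conv_info);
}

void run_separable(NEDepthwiseConvolutionLayer &depthwise, NEConvolutionLayer &pointwise)
{
    depthwise.prepare();
    pointwise.prepare();
    depthwise.run();
    pointwise.run();
}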
diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
new file mode 100644
index 0000000..d1d1343
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <ios>
+#include <list>
+
+namespace arm_compute
+{
+NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false)
+{
+}
+
+void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors,
+ ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
+ ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(),
+ output_scores->info(),
+ num_detection->info(), info));
+
+ const ITensor *input_scores_to_use = input_scores;
+ DetectionPostProcessLayerInfo info_to_use = info;
+ _run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type());
+
+ if(_run_dequantize)
+ {
+ _memory_group.manage(&_decoded_scores);
+
+ _dequantize.configure(input_scores, &_decoded_scores);
+
+ input_scores_to_use = &_decoded_scores;
+
+ // Create a new info struct to avoid dequantizing in the CPP layer
+ std::array<float, 4> scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() };
+ DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(),
+ scales_values, info.use_regular_nms(), info.detection_per_class(), false);
+ info_to_use = info_quantized;
+ }
+
+ _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use);
+ _decoded_scores.allocator()->allocate();
+}
+
+Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+{
+ bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type());
+ if(run_dequantize)
+ {
+ TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info));
+
+ return Status{};
+}
+
+void NEDetectionPostProcessLayer::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Decode scores if necessary
+ if(_run_dequantize)
+ {
+ _dequantize.run();
+ }
+ _detection_post_process.run();
+}
+} // namespace arm_compute
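
The new function wraps CPPDetectionPostProcessLayer and inserts an NEDequantizationLayer in front of it when the box encodings are quantized, rewriting the info struct so the scores are not dequantized twice. A minimal usage sketch; the wrapper function, the tensor names and the already-populated DetectionPostProcessLayerInfo are assumptions of this example:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h"

using namespace arm_compute;

// All tensors are assumed to be initialised and allocated with the shapes the CPP layer expects.
void run_detection_postprocess(NEDetectionPostProcessLayer &postprocess,
                               const ITensor *box_encodings, const ITensor *class_scores, const ITensor *anchors,
                               ITensor *out_boxes, ITensor *out_classes, ITensor *out_scores, ITensor *num_detections,
                               const DetectionPostProcessLayerInfo &info)
{
    // If box_encodings is quantized, the scores are dequantized internally before the CPP pass.
    postprocess.configure(box_encodings, class_scores, anchors, out_boxes, out_classes, out_scores, num_detections, info);
    postprocess.run();
}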
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 12a5a1d..ee622f4 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -74,10 +74,11 @@
return NETransposeKernel::validate(input, output);
}
-NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
- _flatten_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false),
- _is_fc_after_conv(false), _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
+NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+ : _memory_group(std::move(memory_manager)), _weights_manager(weights_manager), _flatten_kernel(), _convert_weights(), _convert_weights_managed(), _reshape_weights_function(),
+ _reshape_weights_managed_function(), _mm_gemm(nullptr, weights_manager), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(),
+ _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false),
+ _is_quantized(false), _is_prepared(false)
{
}
@@ -155,6 +156,11 @@
_is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
_original_weights = weights;
+ if(_weights_manager)
+ {
+ _weights_manager->manage(weights);
+ }
+
// Configure gemmlowp output
if(_is_quantized)
{
@@ -194,21 +200,39 @@
// Reshape weights if needed
if(!_are_weights_reshaped)
{
- // Reshape the weights
- _reshape_weights_function.configure(weights, &_reshape_weights_output);
- weights_to_use = &_reshape_weights_output;
+ if(_weights_manager && _weights_manager->are_weights_managed(weights))
+ {
+ _reshape_weights_managed_function.configure(weights);
+ weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed_function);
+ }
+ else
+ {
+ // Reshape the weights
+ _reshape_weights_function.configure(weights, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
+ }
}
// Convert weights if needed
if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
{
- // Convert weights
- _convert_weights.configure(weights_to_use,
- &_converted_weights_output,
- input->info()->tensor_shape(),
- fc_info.weights_trained_layout);
+ if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
+ {
+ _convert_weights_managed.configure(weights_to_use,
+ input->info()->tensor_shape(),
+ fc_info.weights_trained_layout);
+ weights_to_use = _weights_manager->acquire(weights, &_convert_weights_managed);
+ }
+ else
+ {
+ // Convert weights
+ _convert_weights.configure(weights_to_use,
+ &_converted_weights_output,
+ input->info()->tensor_shape(),
+ fc_info.weights_trained_layout);
- weights_to_use = &_converted_weights_output;
+ weights_to_use = &_converted_weights_output;
+ }
_are_weights_converted = false;
}
@@ -381,7 +405,10 @@
{
if(!_is_prepared)
{
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ if(!_weights_manager)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ }
auto release_unused = [](Tensor * w)
{
@@ -397,22 +424,39 @@
// Reshape of the weights (happens only once)
if(!_are_weights_reshaped)
{
- // Run reshape weights kernel and mark weights as unused
- _reshape_weights_output.allocator()->allocate();
- _reshape_weights_function.run();
-
- cur_weights->mark_as_unused();
- cur_weights = &_reshape_weights_output;
+ if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+ {
+ cur_weights = _weights_manager->run(cur_weights, &_reshape_weights_managed_function);
+ }
+ else
+ {
+ // Reshape of the weights (happens only once)
+ if(!_are_weights_reshaped)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_function.run();
+ }
+ cur_weights->mark_as_unused();
+ cur_weights = &_reshape_weights_output;
+ }
_are_weights_reshaped = true;
}
// Convert weights if needed (happens only once)
if(!_are_weights_converted)
{
- _converted_weights_output.allocator()->allocate();
- _convert_weights.run();
+ if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+ {
+ _weights_manager->run(cur_weights, &_convert_weights_managed);
+ }
+ else
+ {
+ _converted_weights_output.allocator()->allocate();
+ _convert_weights.run();
+ cur_weights->mark_as_unused();
+ }
- cur_weights->mark_as_unused();
_are_weights_converted = true;
}
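
The fully connected changes above are about letting an external IWeightsManager own the reshaped and layout-converted weights instead of each function keeping a private copy; note that prepare() now skips the is_used() assertion when a manager is present. A sketch of how two layers reading the same weights tensor might be wired up under a common manager; the SharedFC struct is hypothetical and the degree of transform reuse across instances is an assumption, not a library guarantee:

#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"

using namespace arm_compute;

// Hypothetical pair of fully connected layers that read the same weights tensor.
struct SharedFC
{
    explicit SharedFC(IWeightsManager *wm)
        : fc0(nullptr, wm), fc1(nullptr, wm) // same weights manager for both functions
    {
    }

    void configure(ITensor *in0, ITensor *in1, const ITensor *weights, const ITensor *bias, ITensor *out0, ITensor *out1)
    {
        // configure() registers the weights with the manager (the _weights_manager->manage(weights)
        // call above), so the reshape/convert steps are tracked centrally rather than per function.
        fc0.configure(in0, weights, bias, out0);
        fc1.configure(in1, weights, bias, out1);
    }

    void run()
    {
        fc0.prepare();
        fc1.prepare();
        fc0.run();
        fc1.run();
    }

    NEFullyConnectedLayer fc0;
    NEFullyConnectedLayer fc1;
};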
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 37d0e09..baa22b7 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -34,7 +34,6 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "arm_compute/runtime/TensorAllocator.h"
-#include "support/ToolchainSupport.h"
#include <cmath>
@@ -42,9 +41,10 @@
namespace arm_compute
{
-NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr),
- _run_vector_matrix_multiplication(false), _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+ : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(),
+ _alpha_scale_func(nullptr), _add_bias_kernel(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
+ _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
{
}
@@ -52,34 +52,55 @@
{
ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
+ const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
+ bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), gemm_info));
+
// Check if we need to reshape the matrix B only on the first run
_is_prepared = false;
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
_original_b = b;
-
- bool run_optimised = c == nullptr && bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
+ _run_alpha_scale = alpha != 1.f;
+ _run_bias_addition = c != nullptr && gemm_info.reshape_b_only_on_first_run();
+ _run_addition = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run();
+ _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !NEGEMMAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
if(run_optimised)
{
+ const ITensor *c_to_use = is_c_bias ? c : nullptr;
if(MEMInfo::get_policy() == MemoryPolicy::MINIMIZE)
{
GEMMInfo gemm_info_ntb = gemm_info;
gemm_info_ntb.set_pretranpose_B(false);
- _asm_glue.configure(a, b, c, d, alpha, beta, gemm_info_ntb);
+ _asm_glue.configure(a, b, c_to_use, d, gemm_info_ntb);
}
else
{
- _asm_glue.configure(a, b, c, d, alpha, beta, gemm_info);
+ _asm_glue.configure(a, b, c_to_use, d, gemm_info);
}
ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
+
+ // Scale product by alpha
+ if(_run_alpha_scale)
+ {
+ _alpha_scale_func.configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
+ }
}
else
{
+ // Pick output tensor in case bias addition should be performed
+ ITensor *gemm_output_to_use = d;
+ if(_run_bias_addition)
+ {
+ gemm_output_to_use = &_tmp_d;
+ _memory_group.manage(&_tmp_d);
+ }
+
+ // Select between GEMV and GEMM
if(_run_vector_matrix_multiplication)
{
// Configure the matrix multiply kernel
- _mm_kernel.configure(a, b, d, alpha, false);
+ _mm_kernel.configure(a, b, gemm_output_to_use, alpha, false);
}
else
{
@@ -117,7 +138,7 @@
_transpose_kernel.configure(b, &_tmp_b);
// Configure matrix multiplication kernel
- _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha, true, GEMMReshapeInfo(m, n, k));
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
// Allocate once the all configure methods have been called
_tmp_a.allocator()->allocate();
@@ -127,18 +148,31 @@
}
}
- // Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
+ if(_run_bias_addition)
{
- _ma_kernel.configure(c, d, beta);
- _run_addition = true;
+ _add_bias_kernel.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
+ _tmp_d.allocator()->allocate();
}
}
+
+ // Configure matrix addition kernel
+ if(_run_addition)
+ {
+ _ma_kernel.configure(c, d, beta);
+ }
+
+ // Configure activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ if(_run_activation)
+ {
+ _activation_func.configure(d, nullptr, activation);
+ }
}
Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
+ const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
@@ -147,7 +181,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- if(c != nullptr)
+ if(c != nullptr && !is_c_bias)
{
ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
@@ -178,7 +212,7 @@
}
// Check if we need to run the optimized assembly kernel
- const bool run_optimised = c == nullptr && bool(NEGEMMAssemblyDispatch::validate(a, b, c, output, alpha, beta, gemm_info));
+ const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, gemm_info));
if(!run_optimised)
{
@@ -225,14 +259,26 @@
// Validate matrix multiply
auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+
+ if(c != nullptr && gemm_info.reshape_b_only_on_first_run())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE));
+ }
}
// Validate matrix addition kernel
- if(beta != 0 && c != nullptr)
+ if(beta != 0 && c != nullptr && !is_c_bias)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAdditionKernel::validate(c, output, beta));
}
+ // Validate activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ if(activation.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
+ }
+
return Status{};
}
@@ -245,6 +291,10 @@
if(_asm_glue.is_configured())
{
_asm_glue.run();
+ if(_run_alpha_scale)
+ {
+ _alpha_scale_func.run();
+ }
}
else
{
@@ -262,12 +312,24 @@
NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
- // Run matrix addition kernel
- if(_run_addition)
+ // Run bias addition kernel
+ if(_run_bias_addition)
{
- NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
}
}
+
+ // Run matrix addition kernel
+ if(_run_addition)
+ {
+ NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ }
+
+ // Run activation function
+ if(_run_activation)
+ {
+ _activation_func.run();
+ }
}
void NEGEMM::prepare()
@@ -276,13 +338,19 @@
{
if(_asm_glue.is_configured())
{
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+ if(!_weights_manager || !_weights_manager->are_weights_managed(_original_b))
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+ }
_asm_glue.prepare();
}
else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue.is_configured())
{
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+ if(!_weights_manager || !_weights_manager->are_weights_managed(_original_b))
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+ }
_tmp_b.allocator()->allocate();
NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
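
Two behavioural changes in NEGEMM are easy to miss in the hunks above: when gemm_info.reshape_b_only_on_first_run() is set, matrix C is now consumed as a broadcast bias (added via NEArithmeticAdditionKernel or fed straight into the assembly kernel) instead of the classic beta*C addition, and alpha != 1 triggers an extra LINEAR activation pass after the assembly path. A small caller-side sketch with hypothetical F32 shapes; GEMMInfo(false, false, true) is assumed to keep its usual meaning of (is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // D = A * B + bias, with the bias carried in C (one value per output column).
    Tensor a, b, c, d;
    a.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::F32)); // K=64, M=16
    b.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::F32)); // N=32, K=64
    c.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));      // bias of length N
    d.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32)); // N=32, M=16

    NEGEMM gemm(nullptr, nullptr); // no memory manager / weights manager in this sketch
    gemm.configure(&a, &b, &c, &d, 1.f, 1.f, GEMMInfo(false, false, true /* reshape_b_only_on_first_run -> C is a bias */));

    a.allocator()->allocate();
    b.allocator()->allocate();
    c.allocator()->allocate();
    d.allocator()->allocate();
    // ... fill a, b and c ...

    gemm.prepare(); // pretransposes B once if the assembly path was selected
    gemm.run();
    return 0;
}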
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 2a4498b..24254eb 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -24,10 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
-#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
#include <arm_neon.h>
@@ -35,63 +33,127 @@
{
namespace
{
-std::unique_ptr<IFunction> create_function_all_types(const arm_gemm::KernelDescription &gemm_kernel_info,
- const ITensor *a, const ITensor *b, ITensor *d,
- float alpha, float beta, const GEMMInfo &gemm_info,
- std::shared_ptr<IMemoryManager> memory_manager)
-
+arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
{
- // Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure()
- switch(gemm_kernel_info.method)
+ arm_gemm::Activation gemm_act;
+
+    // Early exit in case the lower bound is other than 0, as it's not yet supported
+ if(act.b() != 0.f)
{
- case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
- {
- if(!gemm_info.pretranpose_B())
- {
- return nullptr;
- }
- auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
- function->configure(a, b, d, alpha, beta, gemm_info);
- return std::move(function);
- }
-#if defined(__aarch64__)
- case arm_gemm::GemmMethod::GEMM_NATIVE:
- {
- if(gemm_kernel_info.name.find("sgemm_native_16x4") != std::string::npos)
- {
- auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
- kernel->configure(a, b, d, alpha, beta, gemm_info);
- auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
- function->configure(std::move(kernel));
- return std::move(function);
- }
- return nullptr;
- }
-#endif // defined(__aarch64__)
- default:
- return nullptr;
+ return gemm_act;
}
+
+ switch(act.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ gemm_act.type = arm_gemm::Activation::Type::ReLU;
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ gemm_act.type = arm_gemm::Activation::Type::BoundedReLU;
+ gemm_act.param1 = act.a();
+ gemm_act.param2 = 0.f;
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ gemm_act.type = arm_gemm::Activation::Type::BoundedReLU;
+ gemm_act.param1 = act.a();
+ gemm_act.param2 = act.b();
+ break;
+ default:
+ gemm_act.type = arm_gemm::Activation::Type::None;
+ }
+
+ return gemm_act;
}
+template <typename TypeInput, typename TypeOutput>
+class FallbackTransform : public ITransformWeights
+{
+public:
+ void run() override
+ {
+ _output.allocator()->allocate();
+ ARM_COMPUTE_ERROR_ON(_output.buffer() == nullptr);
+ _gemm_kernel_asm->pretranspose_B_array(_output.buffer(), _in1_ptr, _ldb, _multi_stride_b);
+ _reshape_run = true;
+ }
+
+ void release() override
+ {
+ _output.allocator()->free();
+ }
+
+ ITensor *get_weights() override
+ {
+ return &_output;
+ }
+
+ uint32_t uid() override
+ {
+ uint32_t id = (_B_pretranspose_size | 0x80000000);
+ return id;
+ }
+
+ void configure(size_t B_pretranspose_size, unsigned int alignment)
+ {
+ _output.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
+ _B_pretranspose_size = B_pretranspose_size;
+ }
+
+ void set_pretranspose(ITensor *tensor)
+ {
+ if(!_reshape_run)
+ {
+ _gemm_kernel_asm->set_pretransposed_B_data(tensor->buffer());
+ }
+ }
+
+ void set_args(const int ldb, const TypeInput *in1_ptr, const int multi_stride_b, std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> gemm_kernel_asm)
+ {
+ _ldb = ldb;
+ _in1_ptr = in1_ptr;
+ _multi_stride_b = multi_stride_b;
+ _gemm_kernel_asm = gemm_kernel_asm;
+ }
+
+private:
+ Tensor _output{};
+ int _ldb{};
+ const TypeInput *_in1_ptr{};
+ int _multi_stride_b{};
+ size_t _B_pretranspose_size{};
+ std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+};
+
/** Fallback in case ACL doesn't have a function */
template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
class Fallback : public NEGEMMAssemblyDispatch::IFallback
{
public:
+ /** Destructor */
+ ~Fallback()
+ {
+ // Release memory if we have allocated the memory ourselves
+ if(_pretranspose && !(_weights_manager && _weights_manager->are_weights_managed(_b)))
+ {
+ delete _pretranspose;
+ }
+ }
+
     /** Initialise the function's input and output.
*
- * @param[in] a Input tensor containing the Matrix A.
- * @param[in] b Input tensor containing the Matrix B.
- * @param[in] c Input tensor containing the Matrix C.
- * @param[out] d Output tensor to store the result of matrix multiplication.
- * @param[in] args Matrix multiplication information.
- * @param[in] gemm_info GEMM meta-data
- * @param[in] memory_group Memory group to be used by the function.
- * @param[in] os Output stage meta-data.
+ * @param[in] a Input tensor containing the Matrix A.
+ * @param[in] b Input tensor containing the Matrix B.
+ * @param[in] c Input tensor containing the Matrix C.
+ * @param[out] d Output tensor to store the result of matrix multiplication.
+ * @param[in] args Matrix multiplication information.
+ * @param[in] gemm_info GEMM meta-data
+ * @param[in] memory_group Memory group to be used by the function.
+ * @param[in] weights_manager Weights manager to be used by the function.
+ * @param[in] os Output stage meta-data.
*/
void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
- arm_gemm::GemmArgs<TypeOutput> args, const GEMMInfo &gemm_info,
- MemoryGroup &memory_group, const OutputStage &os = {});
+ arm_gemm::GemmArgs args, const GEMMInfo &gemm_info,
+ MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {});
// Inherited methods overridden:
void run() override;
@@ -108,7 +170,7 @@
void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment);
/** Assembly Gemm kernel */
- std::unique_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+ std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
/** Optimised NEON kernel */
std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
/** Input A */
@@ -130,23 +192,30 @@
/** GEMM workspace */
Tensor _workspace{};
/** Pre-transpose tensor */
- Tensor _pretranspose{};
+ ITensor *_pretranspose{ nullptr };
/** Prepared flag */
bool _is_prepared{ false };
/** GEMM meta-data */
GEMMInfo _gemm_info{};
+ /** Weights manager */
+ IWeightsManager *_weights_manager{ nullptr };
+ /** Weights transform object */
+ FallbackTransform<TypeInput, TypeOutput> _weights_transform{};
+ /** GEMM kernel description */
+ arm_gemm::KernelDescription _kernel_info{};
};
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
- arm_gemm::GemmArgs<TypeOutput> args, const GEMMInfo &gemm_info,
- MemoryGroup &memory_group, const OutputStage &os)
+ arm_gemm::GemmArgs args, const GEMMInfo &gemm_info,
+ MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
{
- arm_gemm::GemmConfig gemm_cfg;
- const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
- if(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
+ arm_gemm::GemmConfig gemm_cfg;
+ _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
+ _weights_manager = weights_manager;
+ if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
{
- gemm_cfg.filter = gemm_kernel_info.name;
+ gemm_cfg.filter = _kernel_info.name;
args._cfg = &gemm_cfg;
}
_gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
@@ -190,7 +259,16 @@
// Forcing 128-byte alignment (required by 32-bit kernels)
const unsigned int alignment = 128;
const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
- _pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
+ if(weights_manager && _weights_manager->are_weights_managed(b))
+ {
+ _weights_transform.configure(B_pretranspose_size, alignment);
+ _pretranspose = _weights_manager->acquire(b, &_weights_transform);
+ }
+ else
+ {
+ _pretranspose = new Tensor();
+ static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
+ }
}
}
@@ -202,20 +280,34 @@
     // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
if(_c && _c->info()->data_type() == DataType::S32)
{
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(_c->buffer() + _c->info()->offset_first_element_in_bytes()));
+ _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(_c->buffer() + _c->info()->offset_first_element_in_bytes()), 0);
}
// Pretranspose B if required
if(_gemm_kernel_asm->B_pretranspose_required())
{
- _pretranspose.allocator()->allocate();
- ARM_COMPUTE_ERROR_ON(_pretranspose.buffer() == nullptr);
const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
- _gemm_kernel_asm->pretranspose_B_array(_pretranspose.buffer(), in1_ptr, ldb, multi_stride_b);
- _b->mark_as_unused();
+ if(_weights_manager && _weights_manager->are_weights_managed(_b))
+ {
+ _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm);
+ _weights_manager->run(_b, &_weights_transform);
+
+ // If we didn't run the reshape function, set the pretransposed buffer
+ if(!_weights_transform.is_reshape_run())
+ {
+ _weights_transform.set_pretranspose(_pretranspose);
+ }
+ }
+ else
+ {
+ static_cast<Tensor *>(_pretranspose)->allocator()->allocate();
+ ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr);
+ _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b);
+ _b->mark_as_unused();
+ }
}
_is_prepared = true;
@@ -284,105 +376,122 @@
// Prepare assembly kernel
prepare();
+ TypeOutput *bias = nullptr;
+    // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
+ if(_c && _c->info()->data_type() != DataType::S32)
+ {
+ bias = reinterpret_cast<TypeOutput *>(_c->buffer() + _c->info()->offset_first_element_in_bytes());
+ }
// Set gemm parameters
- _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);
+ _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
+ in1_ptr, ldb, multi_stride_b,
+ out_ptr, ldd, batch_stride_d, multi_stride_d,
+ bias, 0);
// Schedule assembly kernel
- NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void create_function_or_arm_gemm(std::unique_ptr<IFunction> &acl_function, std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
- const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
-
- arm_gemm::GemmArgs<TypeOutput> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, gemm_info.pretranpose_B());
-
- // Try to create an ACL function:
- const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args);
- acl_function = create_function_all_types(gemm_kernel_info, a, b, d, alpha, beta, gemm_info, std::move(memory_manager));
-
- // If we still don't have an ACL function:
- if(acl_function == nullptr)
+ IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
+ if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32)
{
- //Fallback onto arm_gemm function if ACL doesn't support this method.
- auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>();
- fallback->configure(a, b, c, d, args, gemm_info, memory_group);
- arm_gemm = std::move(fallback);
+ const int granule_threshold = 200;
+ scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
}
+ NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
}
template <typename TypeInput, typename TypeOutput>
-void create_function_or_arm_gemm_quant(std::unique_ptr<IFunction> &acl_function, std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
- const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info,
- std::shared_ptr<IMemoryManager> memory_manager)
+void create_arm_gemm(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
+ const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
+ IWeightsManager *weights_manager)
{
INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
const CPUInfo &ci = NEScheduler::get().cpu_info();
unsigned int num_threads = NEScheduler::get().num_threads();
- arm_gemm::GemmArgs<TypeOutput> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, gemm_info.pretranpose_B());
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, gemm_info.pretranpose_B());
+
+ // Create arm_gemm fallback
+ auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>();
+ fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager);
+ arm_gemm = std::move(fallback);
+}
+
+template <typename TypeInput, typename TypeOutput>
+void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
+ const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
+ IWeightsManager *weights_manager)
+{
+ INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, gemm_info.pretranpose_B());
// Configure requantization info
const int32_t a_offset = -a->info()->quantization_info().uniform().offset;
const int32_t b_offset = -b->info()->quantization_info().uniform().offset;
const GEMMLowpOutputStageInfo os_info = gemm_info.gemmlowp_output_stage();
- const arm_gemm::ARequantizeLayer32 gemm_requant_info(nullptr,
+ const arm_gemm::ARequantizeLayer32 gemm_requant_info(nullptr, 0,
a_offset, b_offset, os_info.gemmlowp_offset,
-os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
- // Try to create an ACL function:
- const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args, gemm_requant_info);
- acl_function = create_function_all_types(gemm_kernel_info, a, b, d, alpha, beta, gemm_info, std::move(memory_manager));
-
- // If we still don't have an ACL function:
- if(acl_function == nullptr)
- {
- // Fallback onto arm_gemm function if ACL doesn't support this method.
- auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::ARequantizeLayer32>>();
- fallback->configure(a, b, c, d, args, gemm_info, memory_group, gemm_requant_info);
- arm_gemm = std::move(fallback);
- }
+ // Create arm_gemm fallback
+ auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::ARequantizeLayer32>>();
+ fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager, gemm_requant_info);
+ arm_gemm = std::move(fallback);
}
} //namespace
-NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager)
- : _function(nullptr), _arm_gemm(nullptr), _memory_group(memory_manager), _memory_manager(memory_manager)
+NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+ : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager)
{
}
-Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_UNUSED(alpha, beta, gemm_info);
+ ARM_COMPUTE_UNUSED(gemm_info);
ARM_COMPUTE_UNUSED(c);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
#ifndef __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 || a->data_type() == DataType::S8 || a->data_type() == DataType::QASYMM8, "8bit integer types only supported for aarch64");
#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::U8, DataType::QASYMM8, DataType::S8, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::F16, DataType::F32);
+ if(is_data_type_quantized_per_channel(b->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8_SIGNED && d->data_type() != DataType::S32, "Only S32 output supported for QASYMM8_SIGNED input");
return Status{};
}
-void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
+bool NEGEMMAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
+{
+ arm_gemm::Activation act = map_to_arm_gemm_activation(activation);
+ return act.type != arm_gemm::Activation::Type::None;
+}
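+
+// is_activation_supported() is what NEGEMM uses above to decide whether it can skip its own
+// NEActivationLayer pass. A few concrete probes as a sketch, assuming a default-constructed
+// arm_gemm::Activation means Type::None (which is what the early exit in
+// map_to_arm_gemm_activation() relies on):
+//
+//     #include "arm_compute/core/Types.h"
+//     #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+//
+//     using namespace arm_compute;
+//     using F = ActivationLayerInfo::ActivationFunction;
+//
+//     // Fused into the assembly kernel (lower bound is 0 in all three cases):
+//     bool relu    = NEGEMMAssemblyDispatch::is_activation_supported(ActivationLayerInfo(F::RELU));
+//     bool brelu   = NEGEMMAssemblyDispatch::is_activation_supported(ActivationLayerInfo(F::BOUNDED_RELU, 6.f));
+//     bool lu_zero = NEGEMMAssemblyDispatch::is_activation_supported(ActivationLayerInfo(F::LU_BOUNDED_RELU, 6.f, 0.f));
+//
+//     // Not fused: a non-zero lower bound hits the early exit, and TANH has no arm_gemm
+//     // equivalent, so NEGEMM keeps a separate NEActivationLayer pass for these.
+//     bool lu_neg  = NEGEMMAssemblyDispatch::is_activation_supported(ActivationLayerInfo(F::LU_BOUNDED_RELU, 6.f, -1.f));
+//     bool tanh_a  = NEGEMMAssemblyDispatch::is_activation_supported(ActivationLayerInfo(F::TANH));
+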
+
+void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+ arm_gemm::Activation act = map_to_arm_gemm_activation(gemm_info.activation_info());
//If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), alpha, beta, gemm_info))
+ if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), gemm_info))
{
return;
}
@@ -390,27 +499,28 @@
switch(a->info()->data_type())
{
case DataType::F32:
- create_function_or_arm_gemm<float, float>(_function, _arm_gemm, _memory_group, a, b, c, d, alpha, beta, gemm_info, _memory_manager);
+ create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
break;
#ifdef __aarch64__
case DataType::U8:
case DataType::QASYMM8:
if(d->info()->data_type() == DataType::S32)
{
- create_function_or_arm_gemm<uint8_t, uint32_t>(_function, _arm_gemm, _memory_group, a, b, c, d, alpha, beta, gemm_info, _memory_manager);
+ create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
}
else
{
- create_function_or_arm_gemm_quant<uint8_t, uint8_t>(_function, _arm_gemm, _memory_group, a, b, c, d, alpha, beta, gemm_info, _memory_manager);
+ create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
}
break;
case DataType::S8:
- create_function_or_arm_gemm<int8_t, int32_t>(_function, _arm_gemm, _memory_group, a, b, c, d, alpha, beta, gemm_info, _memory_manager);
+ case DataType::QASYMM8_SIGNED:
+ create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
break;
#endif /* __aarch64__ */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- create_function_or_arm_gemm<float16_t, float16_t>(_function, _arm_gemm, _memory_group, a, b, c, d, alpha, beta, gemm_info, _memory_manager);
+ create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
@@ -420,33 +530,20 @@
void NEGEMMAssemblyDispatch::prepare()
{
- if(_function != nullptr)
- {
- _function->prepare();
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
- _arm_gemm->prepare();
- }
+ ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+ _arm_gemm->prepare();
}
bool NEGEMMAssemblyDispatch::is_configured() const
{
- return (_arm_gemm != nullptr && _arm_gemm->is_configured()) || _function != nullptr;
+ return _arm_gemm != nullptr && _arm_gemm->is_configured();
}
void NEGEMMAssemblyDispatch::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_function != nullptr)
- {
- _function->run();
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
- _arm_gemm->run();
- }
+
+ ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+ _arm_gemm->run();
}
} //namespace arm_compute
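
With this change the dispatcher no longer takes explicit alpha/beta parameters: the activation travels inside GEMMInfo, and is_activation_supported() tells the caller whether the assembly kernels can apply it directly. Callers that get false back run a separate NEActivationLayer on the GEMM output instead, which is the pattern NEGEMMLowpMatrixMultiplyCore adopts further down in this patch. A minimal sketch of that fallback, assuming this release's include layout; apply_activation_after_gemm and gemm_output are illustrative names, not part of the patch:

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"

void apply_activation_after_gemm(arm_compute::ITensor &gemm_output, const arm_compute::ActivationLayerInfo &act)
{
    using namespace arm_compute;
    // Only needed when the activation is enabled but cannot be fused into the assembly kernels
    if(act.enabled() && !NEGEMMAssemblyDispatch::is_activation_supported(act))
    {
        NEActivationLayer act_func;
        act_func.configure(&gemm_output, nullptr, act); // nullptr output -> in-place activation
        act_func.run();
    }
}
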
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index e94c893..a730749 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -29,9 +29,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-#include <cmath>
#include <set>
#include <tuple>
@@ -50,7 +48,6 @@
ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
(biases != nullptr) ? biases->info() : nullptr,
output->info()));
-
const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
const ITensor *biases_to_use = (append_biases) ? biases : nullptr;
@@ -62,7 +59,7 @@
Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
if(biases != nullptr)
@@ -89,105 +86,99 @@
NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
}
-NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
- _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false),
- _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
+NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager)
+ : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager),
+ _col2im_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _skip_im2col(false),
+ _skip_col2im(false), _is_quantized(false), _is_prepared(false)
{
}
void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(), act_info, gemm_3d_depth,
- _skip_im2col));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(),
+ act_info, gemm_3d_depth, _skip_im2col));
+ // Create GEMMInfo structure
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
+ gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+ false, GEMMLowpOutputStageInfo(), false, false, act_info);
+
+ // Supported activations in GEMM
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
if(_is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
- const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->info()->quantization_info().uniform();
+ const QuantizationInfo iqinfo = input->info()->quantization_info();
+ const QuantizationInfo wqinfo = weights->info()->quantization_info();
+ const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
+ const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
- input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset));
- weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset));
-
- const UniformQuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier;
- int output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
+ if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
+ {
+ const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
+ weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
+ }
// Merge activation with output stage
int min_activation = 0;
int max_activation = 255;
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+ if(supported_acts.count(act_info.activation()) != 0)
{
- const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo);
- const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo);
+ const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
+ const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int;
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
- _is_activationlayer_enabled = false;
}
GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = oqinfo.offset;
- output_info.gemmlowp_multiplier = output_multiplier;
- output_info.gemmlowp_shift = output_shift;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
        // Revert back QuantizationInfo as input and weights could be used in other convolution layers
- input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset));
- weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, wqinfo.offset));
+ input->info()->set_quantization_info(iqinfo);
+ weights->info()->set_quantization_info(wqinfo);
}
else
{
// Configure matrix multiply function
- _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+ _mm_gemm.configure(input, weights, biases, output, 1.0f, 0.0f, gemm_info);
}
}
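
In the quantized branch above, the supported activations are folded into the GEMMLowp output stage as integer clamping bounds instead of being run as a separate layer. A small worked sketch of that mapping for BOUNDED_RELU, mirroring the configure_mm() logic; the helper names are illustrative only:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize a real value into the QASYMM8 domain of the output tensor
inline int32_t quantize_u8(float x, float scale, int32_t offset)
{
    return std::min<int32_t>(255, std::max<int32_t>(0, static_cast<int32_t>(std::lround(x / scale)) + offset));
}

// BOUNDED_RELU(a): the real range [0, a] becomes the integer clamp [offset, quantize(a)].
// Example: a = 6.0f, output scale = 0.05f, offset = 10 -> bounds [10, 130].
inline void merged_relu6_bounds(float scale, int32_t offset, int32_t &min_act, int32_t &max_act)
{
    min_act = offset;                           // real 0.0 maps to the zero point
    max_act = quantize_u8(6.0f, scale, offset); // real 6.0 maps to the upper clamp bound
}
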
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info,
- int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
{
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const bool is_activation_enabled = act_info.enabled();
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
+ // Create GEMMInfo structure
+ const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+ gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+ false, GEMMLowpOutputStageInfo(), false, false, act_info);
+
if(is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
- const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
-
- std::unique_ptr<ITensorInfo> input_qa = input->clone();
- std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- input_qa->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset));
- weights_qa->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset));
-
- const UniformQuantizationInfo oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier;
- int output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+ const QuantizationInfo &iqinfo = input->quantization_info();
+ const QuantizationInfo &wqinfo = weights->quantization_info();
+ const QuantizationInfo &oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
// Merge activation with output stage
int min_activation = 0;
@@ -199,22 +190,25 @@
};
if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
{
- const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo);
- const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo);
+ const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
+ const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int;
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
}
GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = oqinfo.offset;
- output_info.gemmlowp_multiplier = output_multiplier;
- output_info.gemmlowp_shift = output_shift;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info));
// Perform validation step on GEMMLowp
+ std::unique_ptr<ITensorInfo> input_qa = input->clone();
+ std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+ input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
+ weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info));
}
else
@@ -224,7 +218,7 @@
}
}
-Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
{
const DataType data_type = input_info->data_type();
const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
@@ -232,7 +226,7 @@
// Set dummy tensor shapes for the validation
const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
- const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type);
+ const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
@@ -242,7 +236,7 @@
const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_UNUSED(num_groups);
+ ARM_COMPUTE_UNUSED(num_groups, weights_info);
ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(),
weights->info(),
biases != nullptr ? biases->info() : nullptr,
@@ -262,13 +256,11 @@
const unsigned int kernel_width = weights->info()->dimension(idx_width);
const unsigned int kernel_height = weights->info()->dimension(idx_height);
- _is_prepared = weights_info.retain_internal_weights();
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
- _append_bias = (biases != nullptr) && (!_is_quantized);
- _is_activationlayer_enabled = act_info.enabled();
+ _is_prepared = weights_info.retain_internal_weights();
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _data_layout = data_layout;
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
const ITensor *gemm_input_to_use = input;
ITensor *gemm_output_to_use = output;
@@ -286,7 +278,7 @@
// Check if GEMM3D is supported
if(data_layout == DataLayout::NHWC)
{
- _skip_col2im = bool(validate_gemm3d(input->info(), act_info, conv_h, true));
+ _skip_col2im = bool(validate_gemm3d(input->info(), weights->info(), act_info, conv_h, true));
// If not supported, we need to perform im2col and col2im (or reshape layer)
if(!_skip_col2im)
{
@@ -298,8 +290,6 @@
_skip_col2im = false;
}
- const ITensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
-
// Get parameters from conv_info
unsigned int stride_x = 0;
unsigned int stride_y = 0;
@@ -309,7 +299,18 @@
// _weights_reshaped will be auto configured in the kernel.
// Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM
- _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
+ const ITensor *weights_to_use = weights;
+
+ if(_weights_manager && _weights_manager->are_weights_managed(weights))
+ {
+ _reshape_weights_managed.configure(weights, nullptr);
+ weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed);
+ }
+ else
+ {
+ _reshape_weights.configure(weights, nullptr, &_weights_reshaped);
+ weights_to_use = &_weights_reshaped;
+ }
// Create tensor to store im2col reshaped inputs
if(!_skip_im2col)
@@ -317,16 +318,11 @@
_memory_group.manage(&_im2col_output);
// Configure
- _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
// Update GEMM input
gemm_input_to_use = &_im2col_output;
}
- else if(_append_bias)
- {
- // Configure add bias kernel
- _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
- }
// Create temporary GEMM output tensor in case we cannot skip col2im
if(!_skip_col2im)
@@ -351,7 +347,7 @@
// Configure GEMM
// In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
- configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, gemm_3d_depth);
+ configure_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, gemm_3d_depth);
if(!_skip_im2col)
{
@@ -384,14 +380,6 @@
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
"Output shape does not match the expected one");
-
- // Configure Activation Layer
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
-
- ARM_COMPUTE_UNUSED(weights_info);
}
Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -400,7 +388,7 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON");
@@ -422,10 +410,9 @@
const ITensorInfo *gemm_output_to_use = output;
const ITensorInfo *weights_to_use = weights;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool append_bias = (biases != nullptr) && (!is_quantized);
- bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
- bool is_activation_enabled = act_info.enabled();
+ const bool append_bias = false;
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
// Get convolved dimensions
unsigned int conv_w = 0;
@@ -442,7 +429,7 @@
bool skip_col2im = false;
if(data_layout == DataLayout::NHWC)
{
- skip_col2im = bool(validate_gemm3d(input, act_info, conv_h, true));
+ skip_col2im = bool(validate_gemm3d(input, weights, act_info, conv_h, true));
// If not supported, we need to perform im2col and col2im (or reshape layer)
if(!skip_col2im)
{
@@ -453,16 +440,13 @@
if(skip_col2im)
{
// If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!bool(validate_gemm3d(input, act_info, conv_h, skip_im2col)))
+ if(!bool(validate_gemm3d(input, weights, act_info, conv_h, skip_im2col)))
{
skip_im2col = false;
skip_col2im = false;
}
}
- const unsigned bias_element = (append_bias && !skip_im2col) ? 1 : 0;
- const ITensorInfo *biases_to_use = (append_bias && !skip_im2col) ? biases : nullptr;
-
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -481,19 +465,14 @@
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
}
- if(act_info.enabled())
- {
- ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
- }
-
unsigned int mat_weights_cols = weights->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + bias_element;
+ unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
    // Output tensor auto initialization if not yet initialized
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases_to_use, nullptr));
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, (append_bias && !skip_im2col)), 1, data_type);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr));
+ weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type);
weights_reshaped_info.set_quantization_info(weights->quantization_info());
- weights_to_use = &weights_reshaped_info;
+ weights_to_use = &weights_reshaped_info;
if(!skip_im2col)
{
@@ -511,11 +490,6 @@
ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
gemm_input_to_use = &im2col_reshaped_info;
}
- else if(append_bias)
- {
- // Validate add bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
- }
// Create temporary GEMM output tensor in case we cannot skip col2im
if(!skip_col2im)
@@ -539,12 +513,6 @@
ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h)));
}
- //Validate Activation Layer
- if(is_activation_enabled)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
- }
-
return Status{};
}
@@ -573,11 +541,6 @@
_mm_gemm.run();
}
- if(_skip_im2col && _append_bias)
- {
- NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
- }
-
// Reshape output matrix
if(!_skip_col2im)
{
@@ -590,23 +553,23 @@
_reshape_layer.run();
}
}
-
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
}
void NEGEMMConvolutionLayer::prepare()
{
if(!_is_prepared)
{
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- // Run weights reshaping and mark original weights tensor as unused
- _weights_reshaped.allocator()->allocate();
- _reshape_weights.run();
- _original_weights->mark_as_unused();
+ if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+ {
+ _weights_manager->run(_original_weights, &_reshape_weights_managed);
+ }
+ else
+ {
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ _reshape_weights.run();
+ _original_weights->mark_as_unused();
+ }
// Prepare GEMM
_is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
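
The requantization change in this file is what enables QSYMM8_PER_CHANNEL weights: instead of one real multiplier input_scale * weight_scale / output_scale decomposed by calculate_quantized_multiplier_less_than_one, calculate_quantized_multipliers_less_than_one produces one fixed-point multiplier/shift pair per output channel. A minimal sketch of that decomposition, assuming every real factor lies in (0, 1) as the old scalar path required; the struct and function names below are illustrative, not the library API:

#include <cmath>
#include <cstdint>
#include <vector>

struct ChannelOutputStage
{
    int32_t multiplier; // Q0.31 fixed-point multiplier
    int32_t shift;      // right shift applied after the fixed-point multiply
};

inline ChannelOutputStage quantize_multiplier_lt_one(double real_multiplier)
{
    int    exponent    = 0;
    double significand = std::frexp(real_multiplier, &exponent); // real = significand * 2^exponent, significand in [0.5, 1)
    auto   fixed_point = static_cast<int64_t>(std::round(significand * (1ll << 31)));
    if(fixed_point == (1ll << 31)) // rounding pushed the significand up to 1.0
    {
        fixed_point /= 2;
        ++exponent;
    }
    return { static_cast<int32_t>(fixed_point), -exponent };
}

inline std::vector<ChannelOutputStage> per_channel_output_stage(float input_scale,
                                                                const std::vector<float> &weight_scales,
                                                                float output_scale)
{
    std::vector<ChannelOutputStage> stages;
    stages.reserve(weight_scales.size());
    for(float w_scale : weight_scales)
    {
        stages.push_back(quantize_multiplier_lt_one(double(input_scale) * w_scale / output_scale));
    }
    return stages;
}
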
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
index 63f330b..a478fdd 100644
--- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,11 +26,12 @@
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
{
auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
k->configure(input, output);
_kernel = std::move(k);
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index aa40113..346d025 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -59,7 +59,7 @@
case DataType::QASYMM8:
case DataType::U8:
{
- _asm_glue.configure(a, b, c, output, 1.f, 0.f, GEMMInfo(false, false, true));
+ _asm_glue.configure(a, b, c, output, GEMMInfo(false, false, true));
run_optimised = _asm_glue.is_configured();
break;
}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index a03ec10..5b9d055 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,8 +42,9 @@
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr), _a_offset(0), _b_offset(0),
- _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false)
+ _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+ _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+ _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
{
}
@@ -55,6 +56,7 @@
const ITensor *matrix_a = a;
const ITensor *matrix_b = b;
+ GEMMInfo info = gemm_info;
// Clear state
_mtx_a_reshape_kernel = nullptr;
@@ -64,13 +66,44 @@
_a_offset = a->info()->quantization_info().uniform().offset;
_b_offset = b->info()->quantization_info().uniform().offset;
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
_is_prepared = false;
_fused_assembly_path = false;
+ _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
_original_b = b;
+ const ITensor *a_to_use = a;
+
+    // Convert QASYMM8 -> QASYMM8_SIGNED (and back after the output stage)
+ if(_flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform();
+
+ _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
+ _memory_group.manage(&_signed_a);
+ _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
+ a_to_use = &_signed_a;
+ _a_offset = _signed_a.info()->quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
+ _memory_group.manage(&_signed_output);
+ _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+
+ // Update matrix a
+ matrix_a = &_signed_a;
+ }
+
// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
_fuse_output_stage = true;
_memory_group.manage(&_mm_result_s32);
@@ -82,17 +115,18 @@
switch(a->info()->data_type())
{
case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
case DataType::U8:
case DataType::S8:
{
- if(a->info()->data_type() == DataType::QASYMM8 && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if(a_to_use->info()->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- _asm_glue.configure(a, b, c, output, 1.f, 0.f, gemm_info);
+ _asm_glue.configure(a_to_use, b, c, output, gemm_info);
_fused_assembly_path = _asm_glue.is_configured();
}
else
{
- _asm_glue.configure(a, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, 1.f, 0.f, gemm_info);
+ _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
}
_assembly_path = _asm_glue.is_configured();
break;
@@ -110,7 +144,7 @@
matrix_b = &_tmp_b;
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info());
+ TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
_tmp_a.allocator()->init(a_info);
@@ -124,7 +158,7 @@
// Configure interleave kernel
{
auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
- k->configure(a, &_tmp_a);
+ k->configure(a_to_use, &_tmp_a);
_mtx_a_reshape_kernel = std::move(k);
}
@@ -150,19 +184,19 @@
}
// Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
+ _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
{
- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
+ TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
_vector_sum_row.allocator()->init(info_vector_sum_row);
_memory_group.manage(&_vector_sum_row);
// Configure matrix A reduction kernel
- _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
+ _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false);
}
if(_fuse_output_stage)
@@ -175,8 +209,17 @@
_mm_kernel = std::move(k);
}
- _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
- _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+ _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
+ _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+ _flip_signedness ? &_signed_output : output,
+ a->info()->dimension(0),
+ _a_offset, _b_offset, info.gemmlowp_output_stage());
+
+ if(_flip_signedness)
+ {
+ _convert_from_signed_asymm.configure(&_signed_output, output);
+ }
}
else
{
@@ -188,10 +231,18 @@
_mm_kernel = std::move(k);
}
// Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
}
}
+ // Configure activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ _run_activation = activation.enabled() && (!_assembly_path || (_assembly_path && !NEGEMMAssemblyDispatch::is_activation_supported(activation)));
+ if(_run_activation)
+ {
+ _activation_func.configure(output, nullptr, activation);
+ }
+
// Allocate tensors
if(!_assembly_path && !_run_vector_matrix_multiplication)
{
@@ -219,22 +270,31 @@
{
_mm_result_s32.allocator()->allocate();
}
+
+ if(_flip_signedness)
+ {
+ _signed_a.allocator()->allocate();
+ _signed_output.allocator()->allocate();
+ }
}
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
"The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ GEMMInfo info = gemm_info;
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
+ const ITensorInfo *a_to_use = a;
+
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
TensorInfo mm_result_s32_info{};
@@ -242,31 +302,60 @@
int32_t a_offset = a->quantization_info().uniform().offset;
int32_t b_offset = b->quantization_info().uniform().offset;
- bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
if(fuse_output_stage)
{
auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
}
+ // Convert QASYMM8->QASYMM8_SIGNED
+ TensorInfo signed_a{};
+ TensorInfo signed_output{};
+ bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+ if(flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
+
+ signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
+ a_to_use = &signed_a;
+ a_offset = signed_a.quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+ signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+
+ // Update matrix a
+ matrix_a_info = &signed_a;
+ }
+
// Check if we need to run the optimized assembly kernel
bool run_optimised = false;
bool run_optimised_requantized = false;
- if(is_data_type_quantized_asymmetric(a->data_type()))
+ if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, c, output, 1.f, 0.f, gemm_info));
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
run_optimised_requantized = run_optimised;
}
else
{
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f, gemm_info));
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
}
if(run_optimised)
{
ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
+ if(info.depth_output_gemm3d() != 0)
{
- if(gemm_info.reinterpret_input_as_3d())
+ if(info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
@@ -283,8 +372,8 @@
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
if(!run_vector_matrix_multiplication)
@@ -303,10 +392,10 @@
shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
// Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
}
}
@@ -331,7 +420,7 @@
info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
// Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, a->dimension(0), false));
}
if(fuse_output_stage)
@@ -345,8 +434,10 @@
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
a_offset == 0 ? nullptr : &info_vector_sum_col,
b_offset == 0 ? nullptr : &info_vector_sum_row,
- c, output, a_offset, b_offset,
- gemm_info.gemmlowp_output_stage()));
+ c,
+ flip_signedness ? &signed_output : output,
+ a_offset, b_offset,
+ info.gemmlowp_output_stage()));
}
else
{
@@ -361,6 +452,14 @@
a_offset, b_offset));
}
}
+
+ // Validate activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ if(activation.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
+ }
+
return Status{};
}
@@ -370,6 +469,12 @@
MemoryGroupResourceScope scope_mg(_memory_group);
+ // Convert QASYMM8->QASYMM8_SIGNED
+ if(_flip_signedness)
+ {
+ NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
+ }
+
// Reshape inputs
if(_mtx_a_reshape_kernel)
{
@@ -415,6 +520,18 @@
NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
}
}
+
+ // Convert QASYMM8_SIGNED->QASYMM8
+ if(_flip_signedness)
+ {
+ NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
+ }
+
+ // Run fused activation
+ if(_run_activation)
+ {
+ _activation_func.run();
+ }
}
void NEGEMMLowpMatrixMultiplyCore::prepare()
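
When the weights are QSYMM8_PER_CHANNEL and the input is QASYMM8, the core now remaps the input to QASYMM8_SIGNED before the GEMM and converts the result back afterwards, with the 128 offset correction applied to the quantization info and to the output-stage bounds as shown in the hunks above. The remap itself is a fixed shift of the 8-bit values; a minimal sketch (the helper name is illustrative):

#include <cstddef>
#include <cstdint>

// Shift unsigned 8-bit quantized values into the signed 8-bit domain.
// u XOR 0x80 reinterpreted as int8_t is exactly u - 128.
inline void remap_u8_to_s8(const uint8_t *src, int8_t *dst, size_t n)
{
    for(size_t i = 0; i < n; ++i)
    {
        dst[i] = static_cast<int8_t>(static_cast<int32_t>(src[i]) - 128);
    }
}
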
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
new file mode 100644
index 0000000..7f25b63
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager),
+ _permute_deltas_kernel(),
+ _flatten_deltas_kernel(),
+ _permute_scores_kernel(),
+ _flatten_scores_kernel(),
+ _compute_anchors_kernel(),
+ _bounding_box_kernel(),
+ _pad_kernel(),
+ _dequantize_anchors(),
+ _dequantize_deltas(),
+ _quantize_all_proposals(),
+ _cpp_nms(memory_manager),
+ _is_nhwc(false),
+ _is_qasymm8(false),
+ _deltas_permuted(),
+ _deltas_flattened(),
+ _deltas_flattened_f32(),
+ _scores_permuted(),
+ _scores_flattened(),
+ _all_anchors(),
+ _all_anchors_f32(),
+ _all_proposals(),
+ _all_proposals_quantized(),
+ _keeps_nms_unused(),
+ _classes_nms_unused(),
+ _proposals_4_roi_values(),
+ _all_proposals_to_use(nullptr),
+ _num_valid_proposals(nullptr),
+ _scores_out(nullptr)
+{
+}
+
+void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
+ const GenerateProposalsInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
+ ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+
+ _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
+ const DataType scores_data_type = scores->info()->data_type();
+ _is_qasymm8 = scores_data_type == DataType::QASYMM8;
+ const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int pre_nms_topN = info.pre_nms_topN();
+ const int post_nms_topN = info.post_nms_topN();
+ const size_t values_per_roi = info.values_per_roi();
+
+ const QuantizationInfo scores_qinfo = scores->info()->quantization_info();
+ const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
+ const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
+
+ // Compute all the anchors
+ _memory_group.manage(&_all_anchors);
+ _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+
+ const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
+ _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+
+ // Permute and reshape deltas
+ _memory_group.manage(&_deltas_flattened);
+ if(!_is_nhwc)
+ {
+ _memory_group.manage(&_deltas_permuted);
+ _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+ _deltas_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+ }
+
+ const TensorShape flatten_shape_scores(1, total_num_anchors);
+ _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, scores_data_type, scores_qinfo));
+
+ // Permute and reshape scores
+ _memory_group.manage(&_scores_flattened);
+ if(!_is_nhwc)
+ {
+ _memory_group.manage(&_scores_permuted);
+ _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+ _scores_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _flatten_scores_kernel.configure(scores, &_scores_flattened);
+ }
+
+ Tensor *anchors_to_use = &_all_anchors;
+ Tensor *deltas_to_use = &_deltas_flattened;
+ if(_is_qasymm8)
+ {
+ _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
+ _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
+ _memory_group.manage(&_all_anchors_f32);
+ _memory_group.manage(&_deltas_flattened_f32);
+ // Dequantize anchors to float
+ _dequantize_anchors.configure(&_all_anchors, &_all_anchors_f32);
+ _all_anchors.allocator()->allocate();
+ anchors_to_use = &_all_anchors_f32;
+ // Dequantize deltas to float
+ _dequantize_deltas.configure(&_deltas_flattened, &_deltas_flattened_f32);
+ _deltas_flattened.allocator()->allocate();
+ deltas_to_use = &_deltas_flattened_f32;
+ }
+ // Bounding box transform
+ _memory_group.manage(&_all_proposals);
+ BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f);
+ _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
+ deltas_to_use->allocator()->allocate();
+ anchors_to_use->allocator()->allocate();
+
+ _all_proposals_to_use = &_all_proposals;
+ if(_is_qasymm8)
+ {
+ _memory_group.manage(&_all_proposals_quantized);
+ // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
+ _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+ _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized);
+ _all_proposals.allocator()->allocate();
+ _all_proposals_to_use = &_all_proposals_quantized;
+ }
+ // The original layer implementation first selects the best pre_nms_topN anchors (thus having a lightweight sort)
+ // that are then transformed by bbox_transform. The boxes generated are then fed into a non-sorting NMS operation.
+    // Since we are reusing the NMS layer and we don't implement any sorting here, we let NMS do the sorting (of all the input)
+ // and the filtering
+ const int scores_nms_size = std::min<int>(std::min<int>(post_nms_topN, pre_nms_topN), total_num_anchors);
+ const float min_size_scaled = info.min_size() * info.im_scale();
+ _memory_group.manage(&_classes_nms_unused);
+ _memory_group.manage(&_keeps_nms_unused);
+
+ // Note that NMS needs outputs preinitialized.
+ auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
+ auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+ auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
+
+    // Initialize temporary (unused) outputs
+ _classes_nms_unused.allocator()->init(TensorInfo(TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo));
+ _keeps_nms_unused.allocator()->init(*scores_out->info());
+
+    // Save the outputs (accessed at run time)
+ _scores_out = scores_out;
+ _num_valid_proposals = num_valid_proposals;
+
+ _memory_group.manage(&_proposals_4_roi_values);
+
+ const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height());
+ _cpp_nms.configure(&_scores_flattened /*scores_in*/,
+ _all_proposals_to_use /*boxes_in,*/,
+ nullptr /* batch_splits_in*/,
+ scores_out /* scores_out*/,
+ &_proposals_4_roi_values /*boxes_out*/,
+ &_classes_nms_unused /*classes*/,
+ nullptr /*batch_splits_out*/,
+ &_keeps_nms_unused /*keeps*/,
+ num_valid_proposals /* keeps_size*/,
+ box_nms_info);
+
+ _keeps_nms_unused.allocator()->allocate();
+ _classes_nms_unused.allocator()->allocate();
+ _all_proposals_to_use->allocator()->allocate();
+ _scores_flattened.allocator()->allocate();
+
+ // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
+ _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _proposals_4_roi_values.allocator()->allocate();
+}
+
+Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+ const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
+
+ const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+ const int num_images = scores->dimension(3);
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int values_per_roi = info.values_per_roi();
+
+ const bool is_qasymm8 = scores->data_type() == DataType::QASYMM8;
+
+ ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
+
+ if(is_qasymm8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
+ const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
+ ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
+ }
+
+ TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+
+ TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ if(scores->data_layout() == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+ }
+
+ TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
+
+ TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+ TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info));
+
+ TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
+ TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
+ if(is_qasymm8)
+ {
+ TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&all_anchors_info, &all_anchors_f32_info));
+
+ TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+
+ TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+ proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+
+ if(num_valid_proposals->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
+ }
+
+ if(proposals->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
+ if(is_qasymm8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
+ const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
+ ARM_COMPUTE_RETURN_ERROR_ON(proposals_qinfo.scale != 0.125f);
+ ARM_COMPUTE_RETURN_ERROR_ON(proposals_qinfo.offset != 0);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(proposals, scores);
+ }
+ }
+
+ if(scores_out->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores_out, scores);
+ }
+
+ return Status{};
+}
+
+void NEGenerateProposalsLayer::run()
+{
+ // Acquire all the temporaries
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Compute all the anchors
+ NEScheduler::get().schedule(&_compute_anchors_kernel, Window::DimY);
+
+ // Transpose and reshape the inputs
+ if(!_is_nhwc)
+ {
+ NEScheduler::get().schedule(&_permute_deltas_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_permute_scores_kernel, Window::DimY);
+ }
+
+ NEScheduler::get().schedule(&_flatten_deltas_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_flatten_scores_kernel, Window::DimY);
+
+ if(_is_qasymm8)
+ {
+ NEScheduler::get().schedule(&_dequantize_anchors, Window::DimY);
+ NEScheduler::get().schedule(&_dequantize_deltas, Window::DimY);
+ }
+
+ // Build the boxes
+ NEScheduler::get().schedule(&_bounding_box_kernel, Window::DimY);
+
+ if(_is_qasymm8)
+ {
+ NEScheduler::get().schedule(&_quantize_all_proposals, Window::DimY);
+ }
+
+ // Non maxima suppression
+ _cpp_nms.run();
+
+ // Add dummy batch indexes
+ NEScheduler::get().schedule(&_pad_kernel, Window::DimY);
+}
+} // namespace arm_compute
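Editor's note: run() above chains anchor generation, the optional NHWC permutes, flattening, the optional dequantize/quantize steps, the bounding-box transform, CPP non-maximum suppression and the final padding. A minimal caller-side sketch for an F32 NCHW feature map follows; the tensor shapes, the argument order and the GenerateProposalsInfo parameters (image width, height, scale) are illustrative assumptions rather than values taken from this change.

#include "arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // 50x38 feature map with 9 anchors per location, NCHW, F32 (illustrative sizes).
    Tensor scores, deltas, anchors, proposals, scores_out, num_valid;
    scores.allocator()->init(TensorInfo(TensorShape(50U, 38U, 9U), 1, DataType::F32));
    deltas.allocator()->init(TensorInfo(TensorShape(50U, 38U, 36U), 1, DataType::F32)); // 4 values per anchor
    anchors.allocator()->init(TensorInfo(TensorShape(4U, 9U), 1, DataType::F32));

    NEGenerateProposalsLayer generate_proposals;
    generate_proposals.configure(&scores, &deltas, &anchors, &proposals, &scores_out, &num_valid,
                                 GenerateProposalsInfo(800.f, 600.f, 1.f)); // assumed (im_width, im_height, im_scale)

    for(Tensor *t : { &scores, &deltas, &anchors, &proposals, &scores_out, &num_valid })
    {
        t->allocator()->allocate(); // outputs are assumed to be auto-initialised by configure()
    }
    generate_proposals.run();
    return 0;
}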
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
new file mode 100644
index 0000000..d7cb7de
--- /dev/null
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+{
+}
+
+void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon)
+{
+ const DataLayout data_layout = input->info()->data_layout();
+
+ // Configure Kernels
+ _is_nchw = data_layout == DataLayout::NCHW;
+
+ if(!_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+
+ _permute_output.configure(&_permuted_output, output != nullptr ? output : input, PermutationVector(2U, 0U, 1U));
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+ else
+ {
+ _normalization_kernel.configure(input, output, gamma, beta, epsilon);
+ }
+}
+
+Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
+{
+ return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+}
+
+void NEInstanceNormalizationLayer::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Permute input
+ if(!_is_nchw)
+ {
+ _permute_input.run();
+ }
+
+ NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
+
+ // Permute output
+ if(!_is_nchw)
+ {
+ _permute_output.run();
+ }
+}
+} // namespace arm_compute
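Editor's note: configure() above runs the normalization kernel directly on NCHW inputs and wraps it in an NHWC->NCHW->NHWC permute pair otherwise. A minimal sketch of the NCHW path, with illustrative shape and gamma/beta/epsilon values:

#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // 16x16 NCHW tensor with 8 channels (illustrative).
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

    NEInstanceNormalizationLayer instance_norm;
    instance_norm.configure(&src, &dst, /*gamma=*/1.f, /*beta=*/0.f, /*epsilon=*/1e-12f);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    instance_norm.run();
    return 0;
}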
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
index 264cca0..cfd996b 100644
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -138,8 +138,7 @@
const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
int output_multiplier = 0;
int output_shift = 0;
-
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
_memory_group.manage(&_output_lowp);
_output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
@@ -340,12 +339,13 @@
input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
- // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
const TensorInfo output_lowp(output_highp.tensor_shape(), 1, DataType::QSYMM16, qsymm_3);
- const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
- ARM_COMPUTE_UNUSED(multiplier);
- ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
+ const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
// _output_stage
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp));
@@ -483,7 +483,7 @@
_tanh_output_state.run();
_mul3.run();
- // Requantize output state from QSYMM16 to QASYMM16
+ // Requantize output state from QSYMM16 to QASYMM8
_dequantize.run();
_quantize.run();
}
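Editor's note: the switch from calculate_quantized_multiplier_less_than_one to calculate_quantized_multiplier lets the requantization factor 4096 * input_scale * weights_scale be greater than or equal to 1 as well as below it. As a standalone illustration of the multiplier/shift pair this kind of decomposition produces (assumed example scales; the sketch mirrors the usual Q0.31 fixed-point split and is not the library routine itself):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Split a positive real multiplier into a Q0.31 fixed-point value and a right shift,
// i.e. multiplier ~= (quantized_multiplier / 2^31) * 2^(-right_shift).
void decompose_multiplier(float multiplier, int32_t *quantized_multiplier, int *right_shift)
{
    int   exponent = 0;
    float mantissa = std::frexp(multiplier, &exponent); // multiplier = mantissa * 2^exponent, mantissa in [0.5, 1)
    auto  fixed    = static_cast<int64_t>(std::round(mantissa * (1ll << 31)));
    if(fixed == (1ll << 31)) // rounding pushed the mantissa up to 1.0
    {
        fixed /= 2;
        ++exponent;
    }
    *quantized_multiplier = static_cast<int32_t>(fixed);
    *right_shift          = -exponent; // a negative value corresponds to a left shift (multiplier >= 1)
}

int main()
{
    // Assumed scales: input scale 1/128, weights scale 1/256 -> multiplier = 4096/32768 = 0.125.
    const float multiplier = 4096.f * (1.f / 128.f) * (1.f / 256.f);
    int32_t     m          = 0;
    int         shift      = 0;
    decompose_multiplier(multiplier, &m, &shift);
    std::printf("%f -> multiplier %d, right shift %d\n", multiplier, m, shift); // 1073741824 (2^30), shift 2
    return 0;
}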
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index c608edf..cf86240 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -34,33 +34,6 @@
{
namespace
{
-TensorInfo get_expected_output_tensorinfo(const ITensorInfo &input, const PaddingList &paddings)
-{
- const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input.tensor_shape(), paddings);
- const TensorInfo expected_output_info = input.clone()->set_tensor_shape(expected_output_shape);
- return expected_output_info;
-}
-
-Status validate_arguments(const ITensorInfo &input, ITensorInfo &output, const PaddingList &paddings)
-{
- const TensorInfo expected_output_info = get_expected_output_tensorinfo(input, paddings);
- auto_init_if_empty(output, expected_output_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output, &expected_output_info);
-
- return Status{};
-}
-
-Coordinates get_subtensor_coords(const PaddingList &paddings)
-{
- Coordinates coords;
- for(unsigned int i = 0; i < paddings.size(); ++i)
- {
- coords.set(i, paddings[i].first);
- }
-
- return coords;
-}
-
uint32_t last_padding_dimension(const PaddingList &padding)
{
int last_padding_dim = padding.size() - 1;
@@ -76,23 +49,13 @@
} // namespace
NEPadLayer::NEPadLayer()
- : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results(), _output_subtensor()
+ : _copy_kernel(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
{
}
void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
{
- // Auto-init
- auto_init_if_empty(*output->info(), get_expected_output_tensorinfo(*input->info(), padding));
-
- // Create SubTensor (Can use sub-tensor as the kernels to be executed do not require padding)
- _output_subtensor = SubTensor(output, input->info()->tensor_shape(), get_subtensor_coords(padding), true);
-
- // Set the pages of the output to the specified value
- _memset_kernel.configure(output, constant_value);
-
- // Copy the input to the output
- _copy_kernel.configure(input, &_output_subtensor);
+ _pad_kernel.configure(input, output, padding, constant_value, PaddingMode::CONSTANT);
}
void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *output)
@@ -253,11 +216,7 @@
{
case PaddingMode::CONSTANT:
{
- auto output_clone = output->clone();
- SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
- ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
- break;
+ return NEPadLayerKernel::validate(input, output, padding, constant_value, mode);
}
case PaddingMode::REFLECT:
case PaddingMode::SYMMETRIC:
@@ -293,8 +252,7 @@
{
case PaddingMode::CONSTANT:
{
- NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
- NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_pad_kernel, Window::DimZ);
break;
}
case PaddingMode::REFLECT:
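Editor's note: with the constant mode now delegated to a single NEPadLayerKernel instead of a memset plus a copy into a sub-tensor, the caller-facing behaviour stays the same. A minimal sketch, with illustrative padding widths and relying on the default constant value and CONSTANT mode:

#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

    // Pad 1 element before/after along X and 2 before/after along Y; the default PixelValue() pads with zeros.
    NEPadLayer pad;
    pad.configure(&src, &dst, PaddingList{ { 1, 1 }, { 2, 2 } });

    src.allocator()->allocate();
    dst.allocator()->allocate(); // dst is assumed to be auto-initialised by configure()
    pad.run();
    return 0;
}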
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 65873b1..4464978 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -27,8 +27,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -46,3 +46,4 @@
k->configure(input, output);
_kernel = std::move(k);
}
+} // namespace arm_compute
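Editor's note: NEQuantizationLayer quantizes a floating-point tensor using the scale and offset carried by the output's QuantizationInfo. A minimal sketch, with an assumed QASYMM8 target and illustrative scale/offset:

#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    // The output data type and QuantizationInfo select the quantization target (illustrative values).
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 10)));

    NEQuantizationLayer quantize;
    quantize.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    quantize.run();
    return 0;
}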
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index 9ca7ded..67f4064 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -34,8 +34,8 @@
namespace arm_compute
{
NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
- _is_prepared(false)
+ : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(),
+ _add_output(), _is_prepared(false)
{
}
@@ -81,7 +81,7 @@
// Manage intermediate buffers and configure
_memory_group.manage(&_fully_connected_out);
- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+ _fully_connected.configure(input, weights, bias, &_fully_connected_out);
_memory_group.manage(&_gemm_output);
_gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
@@ -106,7 +106,7 @@
MemoryGroupResourceScope scope_mg(_memory_group);
- _fully_connected_kernel.run();
+ _fully_connected.run();
_gemm_state_f.run();
@@ -121,7 +121,7 @@
{
if(!_is_prepared)
{
- _fully_connected_kernel.prepare();
+ _fully_connected.prepare();
_gemm_state_f.prepare();
_is_prepared = true;
diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
new file mode 100644
index 0000000..b4e0a2f
--- /dev/null
+++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info));
+
+ return Status{};
+}
+
+void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+ // Configure ROI pooling kernel
+ auto k = arm_compute::support::cpp14::make_unique<NEROIAlignLayerKernel>();
+ k->configure(input, rois, output, pool_info);
+ _kernel = std::move(k);
+}
+
+} // namespace arm_compute
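Editor's note: a minimal sketch of the new ROI align function; the ROI tensor layout (one batch index plus four box coordinates per ROI) and the ROIPoolingLayerInfo values are assumptions based on the usual ROI convention, not on this file:

#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor feat, rois, pooled;
    // 50x38 NCHW feature map with 256 channels (illustrative).
    feat.allocator()->init(TensorInfo(TensorShape(50U, 38U, 256U), 1, DataType::F32));
    // Assumed layout: 5 values per ROI (batch index + 4 corners), 16 ROIs.
    rois.allocator()->init(TensorInfo(TensorShape(5U, 16U), 1, DataType::F32));

    NEROIAlignLayer roi_align;
    // Pool every ROI down to 7x7 at 1/16 spatial scale (illustrative ROIPoolingLayerInfo).
    roi_align.configure(&feat, &rois, &pooled, ROIPoolingLayerInfo(7U, 7U, 0.0625f));

    feat.allocator()->allocate();
    rois.allocator()->allocate();
    pooled.allocator()->allocate(); // pooled is assumed to be auto-initialised by configure()
    roi_align.run();
    return 0;
}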
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 0b145f0..10437f5 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -24,80 +24,127 @@
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates &reduction_axis, bool keep_dims)
+{
+ const int reduction_ops = reduction_axis.num_dimensions();
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ convert_negative_axis(axis_local, input_dims);
+ TensorShape out_shape = input->info()->tensor_shape();
+ // Configure reshape layer if we want to drop the dimensions
+ if(!keep_dims)
+ {
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for(int i = 0; i < reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ return out_shape;
+ }
+ else
+ {
+ for(int i = 0; i < reduction_ops; ++i)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ return out_shape;
+ }
+}
+} // namespace
NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
{
}
-Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_UNUSED(keep_dims);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
- TensorShape out_shape = input->tensor_shape();
const unsigned int reduction_ops = reduction_axis.num_dimensions();
const int input_dims = input->num_dimensions();
Coordinates axis_local = reduction_axis;
- // Convert negative axis
- for(unsigned int i = 0; i < reduction_ops; ++i)
+ for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
{
- axis_local[i] = wrap_around(axis_local[i], input_dims);
+ // axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
}
- std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
+ if(output->tensor_shape().total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
+ // Only validate if not using auto_init for the output tensor
+ TensorShape out_shape = input->tensor_shape();
+ // Validate output_shape only if not using auto_init
+ convert_negative_axis(axis_local, input_dims);
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for(unsigned int i = 0; i < reduction_ops; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+ if(output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if(keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
+ const unsigned int remove_index = axis_local[i] - i;
+ ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
+ out_shape.remove_dimension(remove_index);
+ }
}
- if(keep_dims)
- {
- out_shape.set(axis_local[i], 1);
- }
- else
- {
- out_shape.remove_dimension(axis_local[i] - i);
- }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
}
- const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
return Status{};
}
+Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+ return validate_config(input, reduction_axis, keep_dims, output);
+}
+
void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+ // Output auto initialization if not yet initialized
+ const TensorShape output_shape = calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
_reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
_keep_dims = keep_dims;
- Coordinates axis_local = reduction_axis;
- const int input_dims = input->info()->num_dimensions();
- const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
- // Convert negative axis
- for(unsigned int i = 0; i < reduction_ops; ++i)
- {
- axis_local[i] = wrap_around(axis_local[i], input_dims);
- }
+ convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
@@ -116,7 +163,7 @@
}
// Allocate intermediate tensors
- for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
@@ -125,11 +172,10 @@
if(!keep_dims)
{
TensorShape out_shape = input->info()->tensor_shape();
-
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(int i = 0; i < _reduction_ops; ++i)
{
out_shape.remove_dimension(axis_local[i] - i);
}
@@ -141,10 +187,9 @@
void NEReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
-
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(auto &kernel : _reduction_kernels)
{
- _reduction_kernels[i].run();
+ kernel.run();
}
if(!_keep_dims)
@@ -152,3 +197,4 @@
_reshape.run();
}
}
+} // namespace arm_compute
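Editor's note: calculate_reduce_mean_shape() above either sets the reduced dimensions to 1 (keep_dims) or removes them in ascending order (hence the sort and the axis_local[i] - i correction). A minimal sketch of the keep_dims == false path, with illustrative shapes: a (16, 8, 4) tensor averaged over dimensions 1 and 2 collapses to shape (16).

#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 8U, 4U), 1, DataType::F32));

    // Average over dimensions 1 and 2; with keep_dims == false the output shape becomes (16).
    NEReduceMean reduce_mean;
    reduce_mean.configure(&src, Coordinates(1, 2), /*keep_dims=*/false, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate(); // dst is auto-initialised by configure() as shown above
    reduce_mean.run();
    return 0;
}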
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index dc6cf59..397fe21 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
namespace arm_compute
@@ -52,25 +53,78 @@
}
} // namespace
-NEReductionOperation::NEReductionOperation()
- : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
+NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape_kernel(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
{
}
-Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+ const auto is_reshape_required = !keep_dims;
+
+ auto *output_internal = output;
+
+ TensorInfo info_before_reshape;
+
+ if(is_reshape_required)
+ {
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
+
+ auto shape_before_reshape = input->tensor_shape();
+ shape_before_reshape.set(axis, 1);
+
+ const auto input_num_channels = input->num_channels();
+ const auto input_qinfo = input->quantization_info();
+ const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+ const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type();
+
+ info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channels).set_quantization_info(input_qinfo);
+
+ output_internal = &info_before_reshape;
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op));
+
+ if(is_reshape_required)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(output_internal, output));
+ }
return Status{};
}
-void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op));
+
+ _is_reshape_required = !keep_dims;
+
+ auto *output_internal = output;
+ const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+
+ if(_is_reshape_required)
+ {
+ const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+ const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
+ const auto num_channels = input->info()->num_channels();
+ const auto qinfo = input->info()->quantization_info();
+
+ _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels(
+ num_channels).set_quantization_info(qinfo));
+ _memory_group.manage(&_output_internal);
+ output_internal = &_output_internal;
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true));
+ }
+
+ ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims));
// Configure reduction kernel
- _reduction_kernel.configure(input, output, axis, op);
+ _reduction_kernel.configure(input, output_internal, axis, op);
_window_split = reduction_window_split_dimension(axis);
_reduction_axis = axis;
@@ -150,7 +204,13 @@
default:
ARM_COMPUTE_ERROR("Reduction Operation unsupported");
}
- _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
+ _fill_border_kernel.configure(input, fill_border_size, (is_arg_min_max ? BorderMode::REPLICATE : BorderMode::CONSTANT), pixelValue);
+ }
+
+ if(_is_reshape_required)
+ {
+ _reshape_kernel.configure(output_internal, output);
+ _output_internal.allocator()->allocate();
}
}
@@ -161,5 +221,9 @@
NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
}
NEScheduler::get().schedule(&_reduction_kernel, _window_split);
+ if(_is_reshape_required)
+ {
+ NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+ }
}
} // namespace arm_compute
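Editor's note: the new keep_dims parameter decides whether the reduced axis survives as a size-1 dimension or is squeezed away by the internal reshape kernel, with the intermediate tensor managed by the memory group. A minimal sketch of a SUM reduction over axis 0 with keep_dims == false (shapes illustrative):

#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U, 24U), 1, DataType::F32));

    // Sum along axis 0; with keep_dims == false the output shape is (24) rather than (1, 24).
    NEReductionOperation reduction;
    reduction.configure(&src, &dst, 0, ReductionOperation::SUM, /*keep_dims=*/false);

    src.allocator()->allocate();
    dst.allocator()->allocate(); // dst is auto-initialised by configure() as shown above
    reduction.run();
    return 0;
}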
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 79a9496..f530a87 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -33,13 +33,15 @@
namespace arm_compute
{
-NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+template <bool IS_LOG>
+NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(),
_output_flattened(), _needs_flattening(false)
{
}
-void NESoftmaxLayer::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis)
+template <bool IS_LOG>
+void NESoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis)
{
// Flatten the input
const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
@@ -68,11 +70,12 @@
auto_init_if_empty(*output->info(), *input->info()->clone());
}
-void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis)
+template <bool IS_LOG>
+void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, float beta, size_t axis)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayer::validate(input->info(), output->info(), beta, axis));
+ ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayerGeneric::validate(input->info(), output->info(), beta, axis));
// We don't need flattening only in the case the input is 2D and axis is 1
_needs_flattening = axis != 1;
@@ -138,7 +141,8 @@
_tmp.allocator()->allocate();
}
-Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
+template <bool IS_LOG>
+Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
{
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -173,12 +177,13 @@
}
ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care));
+ ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care));
return Status{};
}
-void NESoftmaxLayer::run()
+template <bool IS_LOG>
+void NESoftmaxLayerGeneric<IS_LOG>::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
@@ -196,4 +201,8 @@
NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
}
}
+
+template class NESoftmaxLayerGeneric<false>;
+template class NESoftmaxLayerGeneric<true>;
+
} // namespace arm_compute
\ No newline at end of file
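Editor's note: the explicit instantiations above provide the regular (IS_LOG == false) and logarithmic (IS_LOG == true) softmax variants from the same templated implementation. A minimal sketch of the regular variant, assuming NESoftmaxLayer remains an alias for NESoftmaxLayerGeneric<false>:

#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(1000U, 4U), 1, DataType::F32)); // 4 rows of 1000 logits

    NESoftmaxLayer softmax;
    softmax.configure(&src, &dst, /*beta=*/1.f, /*axis=*/1); // 2D input with axis 1 takes the no-flattening path

    src.allocator()->allocate();
    dst.allocator()->allocate();
    softmax.run();
    return 0;
}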
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index e699ad1..6983c1c 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -33,6 +33,7 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "support/ToolchainSupport.h"
+#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp"
namespace arm_compute
@@ -232,6 +233,31 @@
return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
}
+inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
+{
+ return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ||
+ act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
+}
+
+arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info)
+{
+ switch(act_info.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ {
+ return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b());
+ }
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ {
+ return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b());
+ }
+ default:
+ {
+ return arm_gemm::Activation(arm_gemm::Activation::Type::None);
+ }
+ }
+}
+
} //namespace
NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
@@ -257,6 +283,8 @@
const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+
// Check if the Winograd configuration requires fast math
if(!enable_fast_math)
{
@@ -388,21 +416,15 @@
* data_type_size;
// Output storage
- const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
- use_same_padding)
- * data_type_size;
- ;
- const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
- const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
-
- const int output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
- const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
-
- const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+ const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size;
+ const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels);
+ const int output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels);
+ const auto output_shape = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
+ const int input_matrix_stride = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
// Configure GEMM
- const int tile_rows = iceildiv(output_shape.n_rows, output_tile.height);
- const int tile_cols = iceildiv(output_shape.n_cols, output_tile.width);
+ const int tile_rows = iceildiv(output_shape.first, output_tile.height);
+ const int tile_cols = iceildiv(output_shape.second, output_tile.width);
const int m = in_shape.n_batches * tile_rows * tile_cols;
const int k = in_shape.n_channels;
const int n = out_channels;
@@ -489,9 +511,19 @@
_memory_group.manage(&_output_nhwc);
output_to_use = &_output_nhwc;
}
- transform_output_kernel->configure(biases, &_output_transformed,
- output_matrix_stride, output_to_use,
- in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace);
+ const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
+
+ transform_output_kernel->configure(biases,
+ &_output_transformed,
+ output_matrix_stride,
+ output_to_use,
+ in_shape.n_batches,
+ output_shape.first,
+ output_shape.second,
+ out_channels,
+ &_output_workspace,
+ activation);
+
const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
_output_workspace.allocator()->init(output_workspace_info);
@@ -510,7 +542,7 @@
_transform_output_kernel = std::move(transform_output_kernel);
//Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
+ _is_activationlayer_enabled = act_info.enabled() && !fuse_function_supported(act_info);
if(_is_activationlayer_enabled)
{
_activationlayer_function.configure(_output, nullptr, act_info);
@@ -546,7 +578,7 @@
_permute_output.run();
}
- if(_is_activationlayer_enabled)
+ if(_is_activationlayer_enabled)
{
_activationlayer_function.run();
}
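Editor's note: fuse_function_supported() and arm_gemm_activation_from_acl_activation() route RELU and BOUNDED_RELU into the output transform, so the standalone activation layer is only configured and run for other activation functions. A minimal sketch of a configure call that would take the fused path; the tensor shapes, padding and the configure signature with a trailing ActivationLayerInfo are assumptions, not taken from this hunk:

#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, biases, dst;
    // NCHW: 56x56 input with 64 channels, 3x3 kernels producing 128 channels (illustrative).
    src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U, 128U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 128U), 1, DataType::F32));

    // A bounded ReLU maps onto arm_gemm's fused activation, so no standalone activation pass is scheduled.
    NEWinogradConvolutionLayer winograd;
    winograd.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1),
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();
    winograd.run();
    return 0;
}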
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
deleted file mode 100644
index ac809fa..0000000
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include "src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h"
-
-#include <atomic>
-#include <condition_variable>
-#include <mutex>
-
-namespace arm_compute
-{
-#ifndef NO_MULTI_THREADING
-class BufferManagerMultipleThreads final : public IBufferManager
-{
-public:
- /** Number of buffers to ping pong between */
- static constexpr unsigned int NUM_BUFFERS = 3;
-
- explicit BufferManagerMultipleThreads(unsigned int max_num_users)
- : _buffers(), _max_num_users(max_num_users)
- {
- }
- unsigned int num_buffers() const override
- {
- return NUM_BUFFERS;
- }
- /* - Lock the requested index if it's free and return true if it needs reshaping.
- * - Return false without acquiring the lock if the buffer at the index is already reshaped / being reshaped.
- * - Block if the corresponding buffer for the given index is still being used by a different index.
- */
- bool lock_to_reshape_if_needed(unsigned int index) override
- {
- Buffer &buf = get_buffer_from_index(index);
- while(true)
- {
- if(buf.index == index && buf.state != State::FREE)
- {
- //Another thread already is reshaping / has reshaped this block: nothing to do
- return false;
- }
- else
- {
- std::unique_lock<std::mutex> lock(buf.mutex);
- //If the buffer is free then lock it for reshaping:
- if(buf.state == State::FREE)
- {
- buf.index = index;
- buf.state = State::BEING_RESHAPED;
- return true;
- }
- // Check again just in case it changed while we were acquiring the lock:
- if(buf.index == index)
- {
- //Another thread is reshaping this block already, nothing to do
- return false;
- }
- // buf.index != index: Buffer still being used by another block, need to wait
- buf.sem.wait(lock);
- }
- }
- }
- /* Mark the buffer at the given index as reshaped and release the lock acquired via lock_to_reshape_if_needed() */
- void mark_as_reshaped(unsigned int index) override
- {
- Buffer &buf = get_buffer_from_index(index);
- {
- std::lock_guard<std::mutex> lock(buf.mutex);
- buf.users = _max_num_users;
- buf.state = State::IN_USE;
- }
- buf.sem.notify_all();
- }
-
- /* Block until the buffer at the given index is reshaped */
- void wait_for_reshaping(unsigned int index) override
- {
- Buffer &buf = get_buffer_from_index(index);
- ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
- // Check if it's already ready to use:
- if(buf.state == State::IN_USE)
- {
- return;
- }
- std::unique_lock<std::mutex> lock(buf.mutex);
- //Double check it didn't change while we were acquiring the lock:
- if(buf.state == State::IN_USE)
- {
- return;
- }
- buf.sem.wait(lock);
- }
- /* Mark the buffer at the given index as not used by this thread anymore.
- * Once all the threads have called this method then the buffer is marked as free again.
- */
- void mark_as_unused(unsigned int index) override
- {
- Buffer &buf = get_buffer_from_index(index);
- ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
- if(--buf.users == 0)
- {
- std::unique_lock<std::mutex> lock(buf.mutex);
- buf.state = State::FREE;
- lock.unlock();
- buf.sem.notify_all();
- }
- }
-
-private:
- enum class State
- {
- FREE,
- BEING_RESHAPED,
- IN_USE
- };
- struct Buffer
- {
- unsigned int index{};
- std::atomic_uint users{};
- State state{ State::FREE };
- std::mutex mutex{};
- std::condition_variable sem{};
- };
- std::array<struct Buffer, NUM_BUFFERS> _buffers;
- Buffer &get_buffer_from_index(unsigned int index)
- {
- return _buffers[index % NUM_BUFFERS];
- }
- unsigned int _max_num_users;
-};
-#endif /* NO_MULTI_THREADING */
-
-class BufferManagerSingleThread : public IBufferManager
-{
-public:
- unsigned int num_buffers() const override
- {
- return 1;
- }
- bool lock_to_reshape_if_needed(unsigned int index) override
- {
- ARM_COMPUTE_UNUSED(index);
- return true;
- }
- void mark_as_reshaped(unsigned int index) override
- {
- }
- void wait_for_reshaping(unsigned int index) override
- {
- }
- void mark_as_unused(unsigned int index) override
- {
- }
-};
-
-NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager))
-{
-}
-
-void NEGEMMInterleavedWrapper::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
- NEScheduler::get().run_tagged_workloads(_workloads, _tag.c_str());
-}
-
-void NEGEMMInterleavedWrapper::prepare()
-{
- if(!_is_prepared)
- {
- if(_pretranspose_b)
- {
- _transformed_b.allocator()->allocate();
- NEScheduler::get().schedule(_prepare_b.get(), Window::DimX);
- _b->mark_as_unused();
- }
- else
- {
- _prepare_b->create_workloads(_b_workloads);
- }
- _transform_a->create_workloads(_a_workloads);
- _matrix_multiply->create_workloads(_mm_workloads);
-
- //Maximum number of workloads to create:
- const unsigned int num_threads = NEScheduler::get().num_threads();
- const unsigned int max_iterations = std::max(num_threads, _num_windows);
- //Maximum number of iterations the parameters allow:
- const unsigned int num_iterations = _batch_window.num_iterations_total();
- // Keep the smallest of the two:
- const unsigned int num_windows = std::min(num_iterations, max_iterations);
- const TensorShape window_shape = _batch_window.shape();
- const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
-
- // Create a 1D window to dynamically split the batch window:
- Window win_1D;
- win_1D.set(0, Window::Dimension(0, num_iterations));
-
- // Create one workload for each sub-window:
- for(unsigned int w = 0; w < num_windows; w++)
- {
- Window win = win_1D.split_window(0, w, num_windows);
- const Coordinates start_offset = index2coords(window_shape, win.x().start());
- const Coordinates end_offset = index2coords(window_shape, win.x().end() - 1);
-
- if(_pretranspose_b)
- {
- auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
- {
- //For each block of rows in "M"
- auto workload_mm = this->_mm_workloads.begin();
- for(auto &workload_a : this->_a_workloads)
- {
- // Transform one k_block from A:
- this->_transform_a->transform(workload_a, info, this->_batch_window, start_offset, end_offset);
- // Then perform the matrix multiplication for each x block along N:
- for(unsigned int i = 0; i < num_x_blocks; i++)
- {
- ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
- this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
- }
- }
- };
- _workloads.emplace_back(workload);
- }
- else
- {
- auto workload = [num_threads, start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
- {
- //For each block of rows in "M"
- auto workload_mm = this->_mm_workloads.begin();
- unsigned int workload_b = 0;
- //If there is only one thread then only reshape the B blocks as you need them:
- unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1;
-
- for(auto &workload_a : this->_a_workloads)
- {
- // Transform one k_block from A:
- this->_transform_a->transform(workload_a, info, this->_batch_window, start_offset, end_offset);
- // Then perform the matrix multiplication for each x block along N:
- for(unsigned int i = 0; i < num_x_blocks; i++)
- {
- ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
- if(workload_b_next < this->_b_workloads.size())
- {
- //Lock on BufferManager: need to run it ?
- if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
- {
- this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
- this->_buffer_manager->mark_as_reshaped(workload_b_next);
- }
- workload_b_next++;
- }
- ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
- // Run if needed or wait
- if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
- {
- this->_prepare_b->transform(this->_b_workloads[workload_b], info);
- this->_buffer_manager->mark_as_reshaped(workload_b);
- }
- this->_buffer_manager->wait_for_reshaping(workload_b);
- this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
- this->_buffer_manager->mark_as_unused(workload_b);
- workload_b++;
- }
- }
- };
- _workloads.emplace_back(workload);
- }
- }
- if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0)
- {
- //Make sure the number of workloads is a multiple of the number of threads to avoid dead locks:
- for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; leftover++)
- {
- auto workload = [this](const ThreadInfo & info)
- {
- unsigned int workload_b = 0;
- //If there is only one thread then only reshape the B blocks as you need them:
- unsigned int workload_b_next = 1;
-
- for(unsigned int iteration = 0; iteration < this->_mm_workloads.size(); iteration++)
- {
- if(workload_b_next < this->_b_workloads.size())
- {
- //Lock on BufferManager: need to run it ?
- if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
- {
- this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
- this->_buffer_manager->mark_as_reshaped(workload_b_next);
- }
- workload_b_next++;
- }
- ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
- // Run if needed or wait
- if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
- {
- this->_prepare_b->transform(this->_b_workloads[workload_b], info);
- this->_buffer_manager->mark_as_reshaped(workload_b);
- }
- this->_buffer_manager->wait_for_reshaping(workload_b);
- this->_buffer_manager->mark_as_unused(workload_b);
- workload_b++;
- }
- };
- _workloads.emplace_back(workload);
- }
- }
-
- _is_prepared = true;
- }
-}
-
-void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- _params = INEGEMMWrapperKernel::extract_parameters(a, b, c, gemm_info);
- _a = a;
- _b = b;
- _c = c;
- _pretranspose_b = gemm_info.pretranpose_B();
-
- const DataType input_type = a->info()->data_type();
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
-
- const arm_gemm::KernelDescription gemm_kernel_info = get_gemm_info(input_type, ci, num_threads, _params, alpha, beta, _pretranspose_b);
- ARM_COMPUTE_ERROR_ON(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMM_INTERLEAVED);
-
- // Forcing 128-byte alignment (required by 32-bit kernels)
- const unsigned int alignment = 128;
- _transformed_b.allocator()->init(TensorInfo{}, alignment);
- _tmp_c.allocator()->init(TensorInfo{}, alignment);
- _tag = "NEGEMMInterleaved_" + gemm_kernel_info.name;
-
- // Get strategy
- std::unique_ptr<detail::IInterleavedStrategy> strategy = detail::create_strategy(gemm_kernel_info.name);
- _num_windows = iceildiv(_params.M, strategy->out_height()) * _params.batches;
- ARM_COMPUTE_ERROR_ON(strategy == nullptr);
-
- if(!_pretranspose_b)
- {
- _block_sizes = strategy->calculate_block_sizes_for_strategy(ci, _params);
- _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
- _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
- // If the execution is single threaded or has only one window then the buffer manager only needs 1 buffer else we will use NUM_BUFFERS buffers and ping pong between them:
- const unsigned int num_iterations = _batch_window.num_iterations_total();
- if(NEScheduler::get().num_threads() == 1 || num_iterations == 1)
- {
- _buffer_manager = support::cpp14::make_unique<BufferManagerSingleThread>();
- }
- else
- {
-#ifdef NO_MULTI_THREADING
- ARM_COMPUTE_ERROR("Can't have more than 1 buffer without multiple threads");
-#else /* NO_MULTI_THREADING */
- _buffer_manager = support::cpp14::make_unique<BufferManagerMultipleThreads>(NEScheduler::get().num_threads());
-#endif /* NO_MULTI_THREADING */
- }
- // If B is transposed at every iteration then transformed_B can be managed:
- _memory_group.manage(&_transformed_b);
- auto_init_if_empty(*_transformed_b.info(), _b->info()->clone()->set_tensor_shape(TensorShape(_block_sizes.x_block * _block_sizes.k_block, _buffer_manager->num_buffers())));
- }
- else
- {
- _tag += "_preB";
- }
-
- _prepare_b = strategy->instantiate_prepareB(b, &_transformed_b, _params, ci);
- ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
-
- if(_pretranspose_b)
- {
- _block_sizes = _prepare_b->block_sizes();
- _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
- _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
- }
-
- _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
- _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
- _block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
-
- _transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
- _memory_group.manage(&_transformed_a);
- _memory_group.manage(&_tmp_c);
-
- _transform_a = strategy->instantiate_transformA(_a, &_transformed_a, _block_walker, _params, gemm_info);
- _matrix_multiply = strategy->instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, alpha, beta, gemm_info, num_threads);
- ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
- ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
-
- _transformed_a.allocator()->allocate();
- _tmp_c.allocator()->allocate();
- if(!_pretranspose_b)
- {
- _transformed_b.allocator()->allocate();
- }
-}
-} // namespace arm_compute