arm_compute v18.02
Change-Id: I7207aa488e5470f235f39b6c188b4678dc38d1a6
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 2d08b45..95fcf88 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -26,14 +26,18 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
using namespace arm_compute;
+using namespace arm_compute::misc;
+using namespace arm_compute::misc::shape_calculator;
NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
- : _kernel(), _output_stage_kernel(), _border_handler(), _accumulator(), _has_bias(false), _is_quantized(false)
+ : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
+ _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false)
{
}
@@ -46,30 +50,61 @@
_is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
_has_bias = biases != nullptr;
+ _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
+ conv_info,
+ input->info()->data_type());
+ _are_weights_reshaped = false;
- // Allocate the intermediate accumulator tensor in case of fixed point input
- if(_is_quantized)
+ if(_is_optimized)
{
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
- _accumulator.info()->set_quantization_info(input->info()->quantization_info());
- zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+
+ // Configure optimized depthwise
+ _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC);
+
+ // Configure the function to transform the convolved output to ACL's native ordering format NCHW
+ _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+
+ // Allocate tensors
+ _input_nhwc.allocator()->allocate();
+ _weights_hwio.allocator()->allocate();
+ _output_nhwc.allocator()->allocate();
+
+ // Create convolver (deferred)
+ _dwc_kernel.generate_convolver();
}
+ else
+ {
+ // Allocate the intermediate accumulator tensor in case of quantized input
+ if(_is_quantized)
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
+ _accumulator.info()->set_quantization_info(input->info()->quantization_info());
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ }
- // Configure depthwise convolution kernel
- _kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
+ // Configure depthwise convolution kernel
+ _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
- // Configure border handler
- _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
+ // Configure border handler
+ _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+ }
// Configure biases accumulation
if(_has_bias || _is_quantized)
{
if(_is_quantized)
{
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
int output_multiplier, output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_accumulator, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
+ _output_stage_kernel.configure(&_accumulator, biases, output, output_multiplier, output_shift, output_quant_info.offset);
_accumulator.allocator()->allocate();
}
else
@@ -81,8 +116,35 @@
void NEDepthwiseConvolutionLayer3x3::run()
{
- NEScheduler::get().schedule(&_border_handler, Window::DimX);
- NEScheduler::get().schedule(&_kernel, Window::DimX);
+ // Permute weights in HWIO format if the optimized kernel will be executed
+ if(!_are_weights_reshaped && _is_optimized)
+ {
+ _are_weights_reshaped = true;
+ _permute_weights.run();
+ }
+
+ // Handle input
+ if(_is_optimized)
+ {
+ // Permute input to NHWC format execution
+ _permute_input.run();
+ }
+ else
+ {
+ // Fill border in NCHW format execution
+ NEScheduler::get().schedule(&_border_handler, Window::DimX);
+ }
+
+ // Execute depthwise convolution
+ NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
+
+ // Permute output to ACL's native NCHW format in case of NHWC execution
+ if(_is_optimized)
+ {
+ _permute_output.run();
+ }
+
+ // Add biases
if(_has_bias || _is_quantized)
{
NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
@@ -90,13 +152,14 @@
}
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
- : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _input_reshaped(), _weights_reshaped(), _v2mm_output()
+ : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
+ _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
{
}
void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
@@ -104,14 +167,20 @@
const size_t weights_h = weights->info()->dimension(1);
const size_t weights_z = weights->info()->dimension(2);
- bool has_bias = (biases != nullptr);
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+ // Should bias be appended ?
+ bool append_bias = (biases != nullptr) && !_is_quantized;
+
+ // Calculate output shape
+ TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+
+ // Output width and height
+ const unsigned int conv_w = dwc_output_shape.x();
+ const unsigned int conv_h = dwc_output_shape.y();
// Set up intermediate tensors
- const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
+ const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
const size_t conv_size = conv_w * conv_h;
// Im2Col configuration
@@ -119,25 +188,50 @@
shape_im2col.set(0, patch_size);
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
- const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
- _input_reshaped.allocator()->init(info_im2col);
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
+ _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
- const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
- _weights_reshaped.allocator()->init(info_weights_reshape);
- _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
+ _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+ _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
// GEMV configuration
+ DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
TensorShape shape_v2mm_out = input->info()->tensor_shape();
shape_v2mm_out.set(0, conv_size * weights_z);
shape_v2mm_out.set(1, 1);
shape_v2mm_out.set(2, 1);
- const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
- _v2mm_output.allocator()->init(info_v2mm_out);
+ _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
_v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
- _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+ _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+ _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+
+ // Output stage configuration
+ if(_is_quantized)
+ {
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+ _output_reshaped.allocator()->allocate();
+ }
+
+ // Fill borders on inputs
+ PixelValue zero_in(static_cast<int32_t>(0));
+ PixelValue zero_w(static_cast<int32_t>(0));
+ if(_is_quantized)
+ {
+ zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
+ zero_w = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+ }
+ BorderSize border_size = _v2mm_kernel.border_size();
+ _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+
+ border_size.bottom = 0;
+ _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
// Allocate intermediate tensors
_input_reshaped.allocator()->allocate();
@@ -149,6 +243,12 @@
{
NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
+ NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
-}
\ No newline at end of file
+ if(_is_quantized)
+ {
+ NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
+ }
+}