arm_compute v18.01
Change-Id: I9bfa178c2e38bfd5fc812e62aab6760d87748e05
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index f10ffa6..b84dfd3 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,7 +54,8 @@
}
template <unsigned int matrix_size>
-void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
+ uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON(conv == nullptr);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 25c639f..8f7d940 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -44,6 +44,16 @@
namespace arm_compute
{
+namespace
+{
+TensorShape get_reshaped_weights_shape(const ITensorInfo *weights, bool has_bias)
+{
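+    // One column per kernel (dimension 3); each column holds the flattened kernel values,
+    // with one extra row appended when the bias is embedded into the reshaped matrix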
+ const unsigned int mat_weights_cols = weights->dimension(3);
+ const unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (has_bias ? 1 : 0);
+ return TensorShape(mat_weights_cols, mat_weights_rows);
+}
+} // namespace
+
NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
@@ -51,18 +61,12 @@
void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
- }
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr,
+ output->info(),
+ transpose1xW));
    // Check if biases are present; if so, they will be embedded into the weights matrix
const bool _has_bias = (biases != nullptr);
@@ -72,10 +76,7 @@
if(transpose1xW)
{
// Create tensor to store the reshaped weights
- const unsigned int mat_weights_cols = weights->info()->dimension(3);
- const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+ TensorInfo info_wr = weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(get_reshaped_weights_shape(weights->info(), _has_bias));
_weights_reshaped.allocator()->init(info_wr);
_memory_group.manage(&_weights_reshaped);
@@ -91,6 +92,46 @@
}
}
+Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose1xW)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+    // Check if biases are present; if so, they will be embedded into the weights matrix
+    const bool has_bias = (biases != nullptr);
+
+ if(transpose1xW)
+ {
+ TensorInfo weights_reshaped = weights->clone()->set_tensor_shape(get_reshaped_weights_shape(weights, has_bias));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(&weights_reshaped, output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, output));
+ }
+
+ return Status{};
+}
+
void NEConvolutionLayerReshapeWeights::run()
{
_memory_group.acquire();
@@ -105,6 +146,62 @@
_memory_group.release();
}
+namespace
+{
+TensorShape get_reshaped_weights_shape_conv(const ITensorInfo *weights, bool has_bias, bool is_fully_connected_convolution)
+{
+ unsigned int mat_weights_cols = weights->dimension(3);
+ unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (has_bias ? 1 : 0);
+
+ if(is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ return TensorShape(mat_weights_cols, mat_weights_rows);
+ }
+ else
+ {
+ // Create tensor to store transposed weights
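+        // NEGEMMTranspose1xWKernel packs (16 / element_size) elements together, so the first
+        // dimension is widened by that factor while the number of rows shrinks accordingly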
+ const float transpose_width = 16.0f / weights->element_size();
+ return TensorShape(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+ }
+}
+
+Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, DataType &dt,
+ bool &has_bias,
+ bool &are_weights_reshaped, unsigned int &kernel_width, unsigned int &kernel_height, bool &is_fully_connected_convolution, unsigned int &mat_weights_cols, unsigned int &mat_weights_rows,
+ unsigned int &conv_w, unsigned int &conv_h)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(2) != input->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ dt = input->data_type();
+ has_bias = (biases != nullptr);
+ are_weights_reshaped = weights_info.are_reshaped();
+ kernel_width = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(0);
+ kernel_height = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(1);
+ mat_weights_cols = weights->dimension(3);
+ mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (has_bias ? 1 : 0);
+
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+
+ return Status{};
+}
+} // namespace
+
NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _output_col2im_kernel(),
_input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _workspace(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
@@ -113,42 +210,25 @@
void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
- ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+    // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
- ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
- }
+ DataType dt{};
+ unsigned int kernel_width = 0;
+ unsigned int kernel_height = 0;
+ unsigned int mat_weights_cols = 0;
+ unsigned int mat_weights_rows = 0;
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
+ Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, dt, _has_bias, _are_weights_reshaped,
+ kernel_width, kernel_height,
+ _is_fully_connected_convolution,
+ mat_weights_cols, mat_weights_rows, conv_w, conv_h);
- _has_bias = (biases != nullptr);
- _are_weights_reshaped = weights_info.are_reshaped();
+ ARM_COMPUTE_ERROR_THROW_ON(status);
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
-
- const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
- const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
- conv_info);
-
- // Check if its a "fully connected" convolution, i.e. the output size is 1x1xnum_kernels
- _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+ const unsigned int fixed_point_position = input->info()->fixed_point_position();
#if defined(__arm__)
if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
@@ -162,9 +242,6 @@
}
#endif /* defined(__arm__) || defined(__aarch64__) */
- unsigned int mat_weights_cols = weights->info()->dimension(3);
- unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
-
// Reshape weights if needed
if(_mm_optimised_kernel != nullptr)
{
@@ -230,7 +307,7 @@
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+ _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
_memory_group.manage(&_input_im2col_reshaped);
// Create tensor (interleave) to prepare input tensor for GEMM
@@ -239,7 +316,7 @@
TensorShape shape_interleaved(shape_im2col);
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
- _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
_memory_group.manage(&_input_interleaved_reshaped);
}
@@ -247,7 +324,7 @@
TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
- _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+ _gemm_output.allocator()->init(_input_im2col_reshaped.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_gemm));
_memory_group.manage(&_gemm_output);
// Configure kernels
@@ -273,14 +350,7 @@
_memory_group.manage(&_workspace);
// Configure matrix multiplication kernel
- if(_is_fully_connected_convolution)
- {
- _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f);
- }
- else
- {
- _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
- }
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
_workspace.allocator()->allocate();
}
@@ -303,8 +373,6 @@
_output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
_gemm_output.allocator()->allocate();
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
-
// Allocate intermediate tensor
if(!_are_weights_reshaped)
{
@@ -312,6 +380,128 @@
}
}
+Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info)
+{
+ DataType dt{};
+ bool has_bias{};
+ bool are_weights_reshaped{};
+ bool is_fully_connected_convolution{};
+ unsigned int kernel_width = 0;
+ unsigned int kernel_height = 0;
+ unsigned int mat_weights_cols = 0;
+ unsigned int mat_weights_rows = 0;
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, dt, has_bias, are_weights_reshaped, kernel_width, kernel_height,
+ is_fully_connected_convolution, mat_weights_cols, mat_weights_rows,
+ conv_w, conv_h);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(status);
+
+ std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone();
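+    // Work on a clone so the reshaped-weights description never mutates the caller's ITensorInfo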
+ bool optimised_kernel = false;
+
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+ {
+ optimised_kernel = true;
+ }
+#elif defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
+ {
+ optimised_kernel = true;
+ }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+ // Reshape weights if needed
+ if(optimised_kernel)
+ {
+ if(are_weights_reshaped)
+ {
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->dimension(1);
+ }
+ else
+ {
+            // Create tensor to store the reshaped weights
+ reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, has_bias, is_fully_connected_convolution));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
+ weights = reshaped_weights.get();
+ }
+ }
+ else
+ {
+ if(are_weights_reshaped)
+ {
+ const unsigned int transpose_width = 16 / input->element_size();
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->dimension(0) / transpose_width + (has_bias ? 1 : 0);
+ }
+ else
+ {
+ // Create tensor to store the reshaped weights
+ reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, has_bias, is_fully_connected_convolution));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
+ weights = reshaped_weights.get();
+ }
+ }
+
+ // Validate im2col
+ const unsigned int mat_input_cols = mat_weights_rows;
+ const unsigned int mat_input_rows = conv_w * conv_h;
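+    // Each im2col row is one flattened receptive field (hence mat_input_cols == mat_weights_rows),
+    // and there is one row per output pixel of the convolution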
+ TensorShape shape_im2col = input->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+ TensorInfo im2_col_info = input->clone()->set_tensor_shape(shape_im2col);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, Size2D(weights->dimension(0), weights->dimension(1)), conv_info, has_bias));
+
+ // Create GEMM output tensor
+ TensorShape shape_gemm(im2_col_info.tensor_shape());
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ TensorInfo gemm_output_info = input->clone()->set_tensor_shape(shape_gemm);
+
+ // Validate GEMM interleave and multiply
+ if(!is_fully_connected_convolution)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
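+        // NEGEMMInterleave4x4Kernel packs blocks of 4 rows together: the width grows by 4x while the height shrinks by the same factor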
+ TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+ return Status{};
+}
+
void NEConvolutionLayer::run()
{
// Run weights reshaping (Runs once for every configure)
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 7b4e77b..7bce8a6 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,39 +24,43 @@
#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
: _memory_group(std::move(memory_manager)),
- _scale_f(),
_conv_f(),
- _scaled_output()
+ _scaled_output(),
+ _input(nullptr),
+ _info(),
+ _inner_border()
{
}
void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info,
- unsigned int ax, unsigned int ay, float upscalex, float upscaley)
+ unsigned int inner_border_right, unsigned int inner_border_top)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) < 1);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 1 && weights->info()->dimension(0) != 3 && weights->info()->dimension(0) != 5);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
- info.pad().first, info.pad().second, ax, ay, upscalex, upscaley, info.round());
+ _input = input;
+ _info = info;
+ _inner_border = std::make_pair(inner_border_right, inner_border_top);
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights, bias);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, weights, bias);
-
+ ARM_COMPUTE_UNUSED(output_shape);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
@@ -64,51 +68,51 @@
_memory_group.manage(&_scaled_output);
// configure scale function
- //Init and allocate intermmidiate tensor for output, same size as input but the first two axis are the same as the output tensor
- TensorShape scale_out_shape(input->info()->tensor_shape());
- scale_out_shape.set(0, output->info()->dimension(0));
- scale_out_shape.set(1, output->info()->dimension(1));
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    // Init and allocate intermediate tensor for output, same size as input but with the first two axes matching the output tensor
+ const TensorInfo scale_out_info(compute_deconvolution_shape(*input->info(), stride_x, stride_y, inner_border_right, inner_border_top, info), 1, input->info()->data_type(),
+ input->info()->fixed_point_position());
_scaled_output.allocator()->init(scale_out_info);
- const unsigned int kernel_size = weights->info()->dimension(0);
- // Padding for the upsampled image is calculated with the equiation: p' = k - p - 1, where k is kernel size and p is the input padding
- ARM_COMPUTE_ERROR_ON(info.pad().first > (kernel_size - 1));
- const unsigned int tr_px = kernel_size - info.pad().first - 1;
- const unsigned int tr_py = kernel_size - info.pad().second - 1;
- const unsigned int tr_stride = 1;
- const PadStrideInfo transposed_info(tr_stride, tr_stride, tr_px, tr_py);
- _scale_f.configure(input, &_scaled_output, std::make_pair(ax, ay), std::make_pair(info.stride().first - 1u, info.stride().second - 1u), transposed_info);
+
// setup the function to convolve the upscaled output
- switch(kernel_size)
- {
- case 1:
- {
- _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
- break;
- }
- case 3:
- {
- _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
- break;
- }
- case 5:
- {
- _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 2, 2, DimensionRoundingType::CEIL));
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- }
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(&_scaled_output, weights, bias, output, conv_info);
_scaled_output.allocator()->allocate();
}
void NEDeconvolutionLayer::run()
{
_memory_group.acquire();
- _scale_f.run();
+
+ // Initialize _scaled_output buffer
+ const int width_in = _input->info()->dimension(0);
+ const int height_in = _input->info()->dimension(1);
+ const int width_scaled = _scaled_output.info()->dimension(0);
+ const int height_scaled = _scaled_output.info()->dimension(1);
+ const int num_2d_slices = _input->info()->tensor_shape().total_size() / (width_in * height_in);
+ const int stride_x = _info.stride().first;
+ const int stride_y = _info.stride().second;
+
+ std::fill_n(reinterpret_cast<float *>(_scaled_output.buffer()), _scaled_output.info()->tensor_shape().total_size(), 0.f);
+
+ // scaled_output is the input for the forward convolution. We copy the input elements to scaled_output
+ // and insert rows and columns with zeroes depending on the stride values.
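+    // For example, with stride 2x2 a 2x2 input [[a, b], [c, d]] is scattered into the
+    // zero-filled scaled_output as [[a, 0, b], [0, 0, 0], [c, 0, d]] (modulo padding and inner border)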
+ for(int slice = 0; slice < num_2d_slices; ++slice)
+ {
+ const int start_x = _info.pad().first;
+ const int start_y = _inner_border.second + _info.pad().second;
+ const int end_y = height_scaled - _info.pad().second;
+ const int end_x = width_scaled - _inner_border.first - _info.pad().first;
+
+ for(int yi = start_y, in_y = 0; yi < end_y; yi += stride_y, in_y++)
+ {
+ for(int xi = start_x, in_x = 0; xi < end_x; xi += stride_x, in_x++)
+ {
+ const auto in = *(reinterpret_cast<float *>(_input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(in_x, in_y, slice))));
+ *(reinterpret_cast<float *>(_scaled_output.buffer() + _scaled_output.info()->offset_element_in_bytes(Coordinates(xi, yi, slice)))) = in;
+ }
+ }
+ }
+
_conv_f.run();
_memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp
deleted file mode 100644
index 79b9b2d..0000000
--- a/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "support/ToolchainSupport.h"
-
-#include <cmath>
-#include <cstddef>
-#include <utility>
-
-using namespace arm_compute;
-
-namespace
-{
-inline void precompute_offsets(ITensor *offsets, float wr, size_t input_element_size, const std::pair<unsigned int, unsigned int> &a,
- const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == offsets);
- Window win;
- const int padx = info.pad().first;
- const int pady = info.pad().second;
- const int ax = a.first;
- const int ay = a.second;
- const int offset_width = offsets->info()->dimension(0);
- const int offset_height = offsets->info()->dimension(1);
- // The values of ax and ay denote the number of ZEROS to be added on the top and right inner border of the image.
- // Step value along the XY axis will depend on the number of zeros to be inserted between samples (number of zeros + 1).
- // Pre-compute the X offset, Y's stride is unknown at this point so we can't precompute Y's offsets
- for(int yi = ay; yi < (offset_height - pady); yi += (1 + iz.second))
- {
- for(int xi = padx; xi < (offset_width - ax); xi += (1 + iz.first))
- {
- int *ptr = reinterpret_cast<int *>(offsets->ptr_to_element(Coordinates(xi, yi)));
- const size_t in_xi = (xi + 0.5f) * wr;
- *reinterpret_cast<int32_t *>(ptr) = in_xi * input_element_size;
- }
- }
-}
-} // namespace
-
-NEDeconvolutionLayerUpsample::NEDeconvolutionLayerUpsample(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
- : _memory_group(std::move(memory_manager)),
- _offsets(),
- _border_handler(),
- _upsample()
-{
-}
-
-void NEDeconvolutionLayerUpsample::configure(ITensor *input, ITensor *output, const std::pair<unsigned int, unsigned int> &a,
- const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
-
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
- }
-
- // Get the tensor shape
- const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
-
- // Compute the ratio between source width/height and destination width/height
- const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
- const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
- ARM_COMPUTE_UNUSED(hr);
- // Get the element size of the input image
- const size_t input_element_size = input->info()->element_size();
-
- TensorInfo tensor_info_offsets(shape, Format::S32);
- _offsets.allocator()->init(tensor_info_offsets);
-
- _upsample.configure(input, &_offsets, output);
-
- // Allocate once the configure methods have been called
- _offsets.allocator()->allocate();
- // Pre-compute offsets for nearest interpolation
- std::fill_n(reinterpret_cast<int32_t *>(_offsets.buffer()), _offsets.info()->total_size() / sizeof(int32_t), -1 * input_element_size);
- precompute_offsets(&_offsets, wr, input_element_size, a, iz, info);
-
- _border_handler.configure(input, _upsample.border_size(), BorderMode::CONSTANT, PixelValue(0.f));
-}
-
-void NEDeconvolutionLayerUpsample::run()
-{
- NEScheduler::get().schedule(&_border_handler, Window::DimZ);
- _memory_group.acquire();
- NEScheduler::get().schedule(&_upsample, Window::DimY);
- _memory_group.release();
-}
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index b890c6f..2d08b45 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,28 +26,56 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
using namespace arm_compute;
NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
- : _kernel(), _bias_kernel(), _border_handler(), _has_bias(false)
+ : _kernel(), _output_stage_kernel(), _border_handler(), _accumulator(), _has_bias(false), _is_quantized(false)
{
}
void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- // Call convolution kernel
- _kernel.configure(input, weights, output, conv_info);
- _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
- if(biases != nullptr)
+ PixelValue zero_value(0.f);
+
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _has_bias = biases != nullptr;
+
+    // Allocate the intermediate accumulator tensor in case of quantized input
+ if(_is_quantized)
{
- _bias_kernel.configure(output, biases);
- _has_bias = true;
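+        // Accumulate in S32: QASYMM8 multiply-accumulates need 32-bit headroom before requantization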
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
+ _accumulator.info()->set_quantization_info(input->info()->quantization_info());
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ }
+
+ // Configure depthwise convolution kernel
+ _kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
+
+ // Configure border handler
+ _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+ // Configure biases accumulation
+ if(_has_bias || _is_quantized)
+ {
+ if(_is_quantized)
+ {
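+            // Decompose the real multiplier s_input * s_weights / s_output into a fixed-point
+            // multiplier and right shift so the output stage can requantize with integer-only arithmetic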
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _output_stage_kernel.configure(&_accumulator, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
+ _accumulator.allocator()->allocate();
+ }
+ else
+ {
+ _output_stage_kernel.configure(output, biases);
+ }
}
}
@@ -55,9 +83,9 @@
{
NEScheduler::get().schedule(&_border_handler, Window::DimX);
NEScheduler::get().schedule(&_kernel, Window::DimX);
- if(_has_bias)
+ if(_has_bias || _is_quantized)
{
- NEScheduler::get().schedule(&_bias_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
}
}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index afa5d97..c26c99a 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -34,7 +34,7 @@
using namespace arm_compute;
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false)
+ : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false), _is_fixed_point(false)
{
}
@@ -50,16 +50,16 @@
_has_bias = (bias != nullptr);
// Allocate the intermediate accumulator tensor in case of fixed point input
- if(is_data_type_fixed_point(input->info()->data_type()))
+ _is_fixed_point = is_data_type_fixed_point(input->info()->data_type());
+ if(_is_fixed_point)
{
const DataType promoted_dt = (input->info()->data_type() == DataType::QS8) ? DataType::QS16 : DataType::QS32;
_accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, promoted_dt, output->info()->fixed_point_position()));
_memory_group.manage(&_accumulator);
_conv_kernel.configure(input, weights, &_accumulator, conv_info);
- if(_has_bias)
- {
- _accumulate_bias_kernel.configure(&_accumulator, bias, output);
- }
+
+ // When no bias is provided, we need to downscale the accumulator tensor
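+        // (the output stage also converts the promoted QS16/QS32 accumulator back to the output data type)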
+ _output_stage_kernel.configure(&_accumulator, bias, output);
_accumulator.allocator()->allocate();
}
else
@@ -67,7 +67,7 @@
_conv_kernel.configure(input, weights, output, conv_info);
if(_has_bias)
{
- _accumulate_bias_kernel.configure(output, bias);
+ _output_stage_kernel.configure(output, bias);
}
}
@@ -90,20 +90,17 @@
// Validate Convolution kernel
ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info));
- // Validate bias
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias == nullptr) && is_data_type_fixed_point(data_type),
- "Biases should be provided for fixed point inputs");
if(bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
"Biases size and number of input feature maps should match");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
-
- // Validate bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerBiasAccumulateKernel::validate(&accumulator, bias, output));
}
+ // Validate bias kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, bias, output));
+
return Status{};
}
@@ -114,10 +111,9 @@
_memory_group.acquire();
NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
- if(_has_bias)
+ if(_has_bias || _is_fixed_point)
{
- NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
}
-
_memory_group.release();
}
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 950f4c9..e640b06 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -50,15 +50,17 @@
{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
- _run_vector_matrix_multiplication(false), _run_addition(false)
+ _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
-void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
+void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
if(c != nullptr)
{
@@ -70,6 +72,8 @@
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
}
+ // Check if we need to reshape the matrix B only on the first run
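+    // (useful when B holds constant weights, as its transposed form can then be computed once and reused)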
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
// Check if the first input tensor is a vector.
@@ -142,7 +146,7 @@
_memory_group.manage(&_workspace);
// Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
+ _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
_workspace.allocator()->allocate();
}
else
@@ -207,8 +211,18 @@
// Run interleave kernel
NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
- // Run transpose kernel
- NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+        // Run transpose kernel; when reshape_b_only_on_first_run is set, the transposed
+        // form of matrix B is computed on the first run only and reused afterwards
+        if(_is_first_run || !_reshape_b_only_on_first_run)
+        {
+            NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+            _is_first_run = false;
+        }
}
NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 6e03ffa..9b36e81 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -74,7 +74,7 @@
#endif /* __aarch64__ */
#ifdef ARM_COMPUTE_AARCH64_V8_2
- if(ci.CPU == CPUTarget::A75_DOT)
+ if(ci.CPU == CPUTarget::A75_DOT || ci.CPU == CPUTarget::A55_DOT)
{
// Configure matrix multiply kernel
GemmInterleaved<gemm_s8_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
@@ -87,10 +87,6 @@
_mm_kernel = std::move(k);
_workspace.allocator()->allocate();
}
- else if(ci.CPU == CPUTarget::A55_DOT)
- {
- ARM_COMPUTE_ERROR_ON("WIP");
- }
else
#elif defined(ARM_COMPUTE_AARCH64_V8A)
if(ci.CPU == CPUTarget::A53)
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 50aa5b6..c4028dc 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -84,7 +84,7 @@
// Configure matrix multiplication kernel
auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
+ k->configure(a, b, output, &_workspace, 1.f, 1.f, false, false);
_mm_kernel = std::move(k);
}
else
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 530c7fc..8a32507 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,9 +43,14 @@
// Configure pooling kernel
_pooling_layer_kernel.configure(input, output, pool_info);
- // Configure border depending on operation required
+    // Configure border depending on the operation required (quantize the border value for asymmetric data types)
BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, PixelValue(static_cast<float>(0.f)));
+ PixelValue zero_value(0.f);
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+ {
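+        // In QASYMM8 the real value 0.0f is represented by the quantization offset, so the
+        // constant border must be filled with the offset rather than a raw zero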
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ }
+ _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
}
Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
index 3251de4..da46f87 100644
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,8 @@
namespace arm_compute
{
NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _winograd_kernel(), _weights_workspace(), _workspace(), _kernel_storage(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
+ : _memory_group(std::move(memory_manager)), _winograd_kernel(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(),
+ _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
{
} /* arm_compute */
@@ -71,85 +72,107 @@
ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
// Get convolved dimensions
- auto padding = PADDING_VALID;
- const int in_channels = input->info()->dimension(2);
+ const int in_channels = input->info()->dimension(2);
+ const int out_channels = output->info()->dimension(2);
- const int out_channels = output->info()->dimension(2);
- const int weights_width = weights->info()->dimension(0);
- const int weights_height = weights->info()->dimension(1);
-
- const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
const Tensor4DShape in_shape(internal_get_input_shape(input));
// Get the memory required to instantiate a new Winograd operator.
- constexpr size_t kstore_alignment = 64;
- const size_t kernel_storage_per_thread = NEWinogradLayerKernel::get_kernel_storage_size(kernel_shape);
- _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_per_thread + kstore_alignment - 1) }, 1, DataType::U8));
+ constexpr size_t storage_alignment = 64;
+ const size_t kernel_storage_size = NEWinogradLayerKernel::get_weight_storage_size(out_channels, in_channels) * sizeof(float);
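+    // Over-allocate by (alignment - 1) bytes so the buffer can be aligned to a 64-byte boundary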
+ _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
_memory_group.manage(&_kernel_storage);
-
- // Get workbench size and allocate memory
- constexpr size_t wspace_alignment = 64;
- const size_t ws_size = NEWinogradLayerKernel::get_working_space_size(in_shape, kernel_shape, padding);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (ws_size + wspace_alignment - 1) }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-
- // Workspace for weights transform
- const size_t weights_transform_size = NEWinogradLayerKernel::get_kernel_transform_working_size(kernel_shape);
- _weights_workspace.allocator()->init(TensorInfo(TensorShape{ (weights_transform_size + wspace_alignment - 1) }, 1, DataType::U8));
- _memory_group.manage(&_weights_workspace);
-
+ _memory_group.manage(&_input_nhwc);
_kernel_storage.allocator()->allocate();
- _workspace.allocator()->allocate();
- _weights_workspace.allocator()->allocate();
+ // Input storage
+ const size_t input_storage_size = NEWinogradLayerKernel::get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, false) * sizeof(float);
+ _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+ _memory_group.manage(&_input_workspace);
+ _input_workspace.allocator()->allocate();
+
+ // Output storage
+ const size_t output_storage_size = NEWinogradLayerKernel::get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, false) * sizeof(float);
+ _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+ _memory_group.manage(&_output_workspace);
+ _output_workspace.allocator()->allocate();
+
+    // configure and allocate the dst tensor used to convert from the winograd domain back to the spatial domain when calling reshape_output()
+ TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
+ _output->info()->dimension(1), _output->info()->dimension(3)),
+ 1, _output->info()->data_type());
+ _output_nhwc.allocator()->init(info);
+
+ _output_nhwc.allocator()->allocate();
+
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
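+    // (i.e. from OIHW to HWIO ordering, which the Winograd weight transform expects)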
+ switch(weights->info()->num_dimensions())
+ {
+ case 3:
+ {
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+ break;
+ }
+ case 4:
+ {
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+ }
+
+ _weights_hwio.allocator()->allocate();
+
+ // configure the kernel to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+
+ _input_nhwc.allocator()->allocate();
// Create Winograd operator object
- _conv = support::cpp14::make_unique<Winograd3x3F32>(kernel_shape, in_shape, padding, _kernel_storage.buffer());
+ _conv = support::cpp14::make_unique<Winograd3x3F32>(
+ in_shape.n_batches,
+ in_shape.n_channels,
+ in_shape.n_rows,
+ in_shape.n_cols,
+ out_channels,
+ false,
+ reinterpret_cast<const float *>(_weights_hwio.buffer()),
+ reinterpret_cast<float *>(_kernel_storage.buffer()),
+ reinterpret_cast<float *>(_input_nhwc.buffer()),
+ reinterpret_cast<float *>(_input_workspace.buffer()),
+ reinterpret_cast<float *>(_output_nhwc.buffer()),
+ reinterpret_cast<float *>(_output_workspace.buffer()));
    // Configure the kernel, padding not needed so it's safe to call configure after allocate
- _winograd_kernel.configure(output, _conv.get());
+ _winograd_kernel.configure(_conv.get());
+
+ // Reorder the convoluted output to ACL's ordering NCHW
+ _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+
}
void NEWinogradLayer::run()
{
-#if defined(__aarch64__)
_memory_group.acquire();
if(!_reshaped_kernel)
{
- _conv->transform_weights(reinterpret_cast<const float *>(_weights->buffer()), reinterpret_cast<float *>(_weights_workspace.buffer()));
_reshaped_kernel = true;
+ _permute_weights.run();
+ _conv->transform_weights();
}
- const Tensor4DShape in_shape(internal_get_input_shape(_input));
- auto padding = PADDING_VALID;
-
//Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- _conv->nchw2nhwc(in_shape, padding, _workspace.buffer(), reinterpret_cast<const float *>(_input->buffer()));
-
- //Get ptrs into the workspace
- std::pair<void *, void *> nhwc_ptrs = _conv->get_nhwc_ptrs(in_shape, padding, _workspace.buffer());
-
- //Setup matrices ptrs and transfor the input tensor to the appropriate form before running GEMM.
- _conv->reshape_input(in_shape, padding, nhwc_ptrs.second, _workspace.buffer());
-
+ _permute_input.run();
+ // Transform input tensor to the winograd domain
+ _conv->transform_input();
//Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
- NEScheduler::get().schedule(&_winograd_kernel, Window::DimY);
-
- //Transform the output to the appropriate form
- _conv->reshape_output(in_shape, padding, nhwc_ptrs.first);
-
- //Transform back to NCHW
- _conv->nhwc2nchw(in_shape, padding, _workspace.buffer(), reinterpret_cast<float *>(_output->buffer()));
-
+ NEScheduler::get().schedule(&_winograd_kernel, Window::DimX);
+ // Transform output tensor to the spatial domain
+ _conv->transform_output();
+ // Reorder the convoluted output to ACL's ordering NCHW
+ _permute_output.run();
_memory_group.release();
-#else /* __aarch64__ */
- ARM_COMPUTE_UNUSED(_winograd_kernel);
- ARM_COMPUTE_UNUSED(_workspace);
- ARM_COMPUTE_UNUSED(_kernel_storage);
- ARM_COMPUTE_UNUSED(_input);
- ARM_COMPUTE_UNUSED(_weights);
- ARM_COMPUTE_UNUSED(_output);
- ARM_COMPUTE_UNUSED(_reshaped_kernel);
- ARM_COMPUTE_UNUSED(_conv);
- ARM_COMPUTE_ERROR("Winograd only supported for aarch64, recompile with arch=arm64-v8a.");
-#endif /* __aarch64__ */
}
} // namespace arm_compute