arm_compute v18.08
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index c27ff2f..d72c98b 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,12 +61,12 @@
void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value,
bool use_fp16)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(gradient_size < 3);
- ARM_COMPUTE_ERROR_ON(gradient_size > 7);
- ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
+ ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7));
+ ARM_COMPUTE_ERROR_ON((lower_thr < 0) || (lower_thr >= upper_thr));
_output = output;
@@ -119,7 +119,7 @@
}
else
{
- ARM_COMPUTE_ERROR("Gradient size not supported\n");
+ ARM_COMPUTE_ERROR("Gradient size %d not supported\n", gradient_size);
}
// Manage intermediate buffers
@@ -171,24 +171,23 @@
void NECannyEdge::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
- ARM_COMPUTE_ERROR_ON(_output == nullptr);
_memory_group.acquire();
// Run sobelNxN
_sobel->run();
- // Fill border before non-maxima suppression. Nop for border mode undefined.
- NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
-
// Run gradient
NEScheduler::get().schedule(_gradient.get(), Window::DimY);
+ // Fill border before non-maxima suppression. Nop for border mode undefined.
+ NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
+
// Run non-maxima suppression
NEScheduler::get().schedule(&_non_max_suppr, Window::DimY);
ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
- memset(_output->buffer(), 0, _output->info()->total_size());
+ std::fill_n(_output->buffer(), _output->info()->total_size(), 0);
// Fill border before edge trace
NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ);
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
new file mode 100644
index 0000000..21ab47d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
+
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+/** Default constructor: no concatenation function is selected until configure() assigns one. */
+NEConcatenateLayer::NEConcatenateLayer()
+ : _concat_function(nullptr)
+{
+}
+
+/** Pick the concrete concatenation function for the requested axis and configure it.
+ *
+ * @param[in]  inputs_vector Tensors to concatenate; forwarded unchanged to the selected function.
+ * @param[out] output        Destination tensor; its data layout is used to resolve @p axis to a dimension index.
+ * @param[in]  axis          Logical dimension to concatenate along. Only axes resolving to index 0 (width)
+ *                           or index 2 (depth) are handled; anything else raises an error.
+ */
+void NEConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output, DataLayoutDimension axis)
+{
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Translate the logical axis into a physical dimension index for the output's layout
+    const auto concat_dim = get_data_layout_dimension_index(output->info()->data_layout(), axis);
+
+    if(concat_dim == 0)
+    {
+        // Width concatenation
+        auto width_concat = support::cpp14::make_unique<NEWidthConcatenateLayer>();
+        width_concat->configure(inputs_vector, output);
+        _concat_function = std::move(width_concat);
+    }
+    else if(concat_dim == 2)
+    {
+        // Depth concatenation
+        auto depth_concat = support::cpp14::make_unique<NEDepthConcatenateLayer>();
+        depth_concat->configure(inputs_vector, output);
+        _concat_function = std::move(depth_concat);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Concatenation is supported across width and depth only!");
+    }
+}
+
+/** Static check mirroring configure(): delegates to the validate() of the function that would be selected.
+ *
+ * @param[in] inputs_vector Tensor infos of the tensors to concatenate.
+ * @param[in] output        Destination tensor info; its data layout is used to resolve @p axis.
+ * @param[in] axis          Logical dimension to concatenate along.
+ *
+ * @return An error status if @p output is null, if the delegated validation fails, or if the axis does not
+ *         resolve to index 0 (width) or index 2 (depth); an empty (OK) status otherwise.
+ */
+Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
+
+    // Translate the logical axis into a physical dimension index for the output's layout
+    const auto concat_dim = get_data_layout_dimension_index(output->data_layout(), axis);
+
+    if(concat_dim == 0)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector, output));
+    }
+    else if(concat_dim == 2)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayer::validate(inputs_vector, output));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_MSG("Concatenation is supported across width and depth only!");
+    }
+
+    return Status{};
+}
+
+/** Run the concatenation function selected by configure(). */
+void NEConcatenateLayer::run()
+{
+ // _concat_function is only assigned in configure(); a null pointer here means run() was called too early
+ ARM_COMPUTE_ERROR_ON(_concat_function == nullptr);
+ _concat_function->run();
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 7053c7e..931e5db 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
#include <cmath>
@@ -41,10 +42,11 @@
}
void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_UNUSED(num_groups);
ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
enable_fast_math));
@@ -78,8 +80,10 @@
}
Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on NEON");
+
switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info))
{
case ConvolutionMethod::WINOGRAD:
@@ -108,6 +112,42 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
ARM_COMPUTE_UNUSED(weights_info);
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+ /* Input spatial dims, kernel size, IFM/OFM, conv info*/
+ using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
+ using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+
+ const std::vector<ConfigurationMethod> known_configs =
+ {
+ // Alexnet
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
+ // VGG16 / VGG19
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
+ // Mobilenet 224
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
+ // Mobilenet 160
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
+ };
+
+ const auto find_config = [&](ConfigurationMethod c)
+ {
+ const ConvolutionConfiguration config = c.first;
+ const PadStrideInfo info = std::get<3>(config);
+
+ return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
+ && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
+ && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+ };
+
+ std::vector<ConfigurationMethod>::const_iterator found;
+ if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ {
+ return (*found).second;
+ }
+
if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
|| input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
{
@@ -119,6 +159,12 @@
+// Runs the underlying convolution function; prepare() is invoked first on every call.
void NEConvolutionLayer::run()
{
+ prepare();
_function->run();
}
+
+/** Forward the preparation request to the underlying convolution function (_function). */
+void NEConvolutionLayer::prepare()
+{
+ _function->prepare();
+}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
new file mode 100644
index 0000000..efa8b89
--- /dev/null
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NECopy.h"
+
+#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+/** Create the copy kernel, configure it and hand ownership to this function.
+ *
+ * @param[in]  input  Tensor forwarded to NECopyKernel::configure() as the input.
+ * @param[out] output Tensor forwarded to NECopyKernel::configure() as the output.
+ */
+void NECopy::configure(ITensor *input, ITensor *output)
+{
+    auto copy_kernel = arm_compute::support::cpp14::make_unique<NECopyKernel>();
+    copy_kernel->configure(input, output);
+
+    // Ownership moves into the function's kernel slot
+    _kernel = std::move(copy_kernel);
+}
+
+/** Static check of a copy configuration; the decision is delegated entirely to NECopyKernel::validate().
+ *
+ * @param[in] input  Tensor info of the input.
+ * @param[in] output Tensor info of the output.
+ *
+ * @return The status returned by NECopyKernel::validate().
+ */
+Status NECopy::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+{
+ return NECopyKernel::validate(input, output);
+}
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 40ada8f..fda9f57 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -38,7 +38,8 @@
_scaled_output(),
_input(nullptr),
_info(),
- _inner_border()
+ _inner_border(),
+ _is_prepared(false)
{
}
@@ -62,18 +63,15 @@
info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, bias);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, bias);
if(bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
}
if(output->tensor_shape().total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
@@ -104,6 +102,7 @@
_input = input;
_info = info;
_inner_border = std::make_pair(inner_border_right, inner_border_top);
+ _is_prepared = false;
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
@@ -115,8 +114,7 @@
// configure scale function
// Init and allocate intermmidiate tensor for output, same size as input but the first two axis are the same as the output tensor
- const TensorInfo scale_out_info(compute_deconvolution_shape(*input->info(), stride_x, stride_y, inner_border_right, inner_border_top, info), 1, input->info()->data_type(),
- input->info()->fixed_point_position());
+ const TensorInfo scale_out_info(compute_deconvolution_shape(*input->info(), stride_x, stride_y, inner_border_right, inner_border_top, info), 1, input->info()->data_type());
_scaled_output.allocator()->init(scale_out_info);
// setup the function to convolve the upscaled output
@@ -132,13 +130,21 @@
+// Executes the layer: prepare() first, then the upsample function followed by the convolution
+// function, both between the memory group's acquire() and release() calls.
void NEDeconvolutionLayer::run()
{
+ prepare();
+
_memory_group.acquire();
- // Run upsample kernel
_upsample_f.run();
-
- // Run convolution layer
_conv_f.run();
_memory_group.release();
+}
+
+/** Prepare the inner convolution function once.
+ *
+ * The _is_prepared flag makes repeated calls cheap: only the first call after the flag
+ * was cleared reaches _conv_f.prepare().
+ */
+void NEDeconvolutionLayer::prepare()
+{
+    // Nothing to do when preparation has already happened
+    if(_is_prepared)
+    {
+        return;
+    }
+
+    _conv_f.prepare();
+    _is_prepared = true;
}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
index 930f8d5..49db855 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -27,7 +27,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
@@ -41,18 +43,22 @@
{
}
-void NEDepthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
+void NEDepthConcatenateLayer::configure(const std::vector<ITensor *> &inputs_vector, ITensor *output) // NOLINT
{
- ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
-
_num_inputs = inputs_vector.size();
_concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEDepthConcatenateLayerKernel[]>(_num_inputs);
_border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
- TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
+ std::vector<ITensorInfo *> inputs_vector_info;
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+ }
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector_info);
// Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(NEDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
unsigned int depth_offset = 0;
for(unsigned int i = 0; i < _num_inputs; ++i)
@@ -67,6 +73,27 @@
output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
}
+/** Static check that a set of inputs can be concatenated along the depth dimension into @p output.
+ *
+ * @param[in] inputs_vector Tensor infos of the inputs. At least two entries are required and none may be null.
+ * @param[in] output        Tensor info of the destination. It is not modified: a clone is auto-initialised
+ *                          and used for the per-input kernel validation.
+ *
+ * @return An error status if @p output or any input is null, if fewer than two inputs are given, or if
+ *         NEDepthConcatenateLayerKernel::validate() rejects an input; an empty (OK) status otherwise.
+ */
+Status NEDepthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+
+    // Reject null inputs before anything dereferences them: inputs_vector[0] is read below and the
+    // shape helper receives the whole vector (presumably reading every entry)
+    for(const auto &input : inputs_vector)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    }
+
+    // Output auto initialization if not yet initialized (done on a clone so the caller's info is untouched)
+    TensorInfo  tmp_output_info = *output->clone();
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_depth_concatenate_shape(inputs_vector);
+    auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
+
+    // Validate every input at the depth offset it would occupy in the output
+    unsigned int depth_offset = 0;
+    for(const auto &input : inputs_vector)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, depth_offset, &tmp_output_info));
+        depth_offset += input->dimension(2);
+    }
+
+    return Status{};
+}
+
void NEDepthConcatenateLayer::run()
{
for(unsigned i = 0; i < _num_inputs; ++i)
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 0a977ad..24b12f4 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -36,8 +36,8 @@
using namespace arm_compute::misc::shape_calculator;
+// Default constructor: kernels, permute functions and intermediate tensors are default-constructed;
+// the flags start as NCHW (_is_nchw = true), first run pending (_is_first_run = true) and everything else false.
NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
- : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
- _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true)
+ : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(),
+ _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false)
{
}
@@ -57,29 +57,31 @@
input->info()->data_layout());
_are_weights_reshaped = false;
_is_nchw = input->info()->data_layout() == DataLayout::NCHW;
-
- ARM_COMPUTE_ERROR_ON(!_is_optimized && !_is_nchw);
+ _permute = _is_optimized == _is_nchw;
if(_is_optimized)
{
if(_is_nchw)
{
// Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+ _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+ _permuted_input.info()->set_data_layout(DataLayout::NHWC);
// Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
// Configure optimized depthwise
- _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, depth_multiplier, DataLayout::NHWC);
+ _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+ _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
// Allocate tensors
- _input_nhwc.allocator()->allocate();
- _weights_hwio.allocator()->allocate();
- _output_nhwc.allocator()->allocate();
+ _permuted_input.allocator()->allocate();
+ _permuted_weights.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
}
else
{
@@ -88,39 +90,88 @@
}
else
{
- // Allocate the intermediate accumulator tensor in case of fixed point input
+ // Allocate the intermediate accumulator tensor in case of quantized input
if(_is_quantized)
{
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
+ TensorShape accum_shape = output->info()->tensor_shape();
+
+ if(!_is_nchw)
+ {
+ permute(accum_shape, PermutationVector(1U, 2U, 0U));
+ }
+
+ _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32));
_accumulator.info()->set_quantization_info(input->info()->quantization_info());
zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
}
- // Configure depthwise convolution kernel
- _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
-
- // Configure border handler
- _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
- }
-
- // Configure biases accumulation
- if(_has_bias || _is_quantized)
- {
- if(_is_quantized)
+ if(!_is_nchw)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_accumulator, biases, output, output_multiplier, output_shift, output_quant_info.offset);
- _accumulator.allocator()->allocate();
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure depthwise convolution kernel (non-optimized path, operating on the permuted NCHW tensors)
+ _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier);
+
+ // Configure border handler
+ _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ _permuted_weights.allocator()->allocate();
}
else
{
- _output_stage_kernel.configure(output, biases);
+ // Configure depthwise convolution kernel
+ _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
+
+ // Configure border handler
+ _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
}
}
+
+ // Configure biases accumulation
+ if(_is_quantized)
+ {
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+ _accumulator.allocator()->allocate();
+ }
+ else if(_has_bias)
+ {
+ _output_stage_kernel.configure((_is_nchw || _is_optimized) ? output : &_permuted_output, biases);
+ }
+
+ if(!_is_optimized && !_is_nchw)
+ {
+ // Configure the function to transform the convoluted output to NHWC
+ _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+ _permuted_output.allocator()->allocate();
+ }
+}
+
+/** Static check of a 3x3 depthwise convolution configuration.
+ *
+ * @param[in] input            Input tensor info; its data layout must be NCHW or NHWC.
+ * @param[in] weights          Weights tensor info.
+ * @param[in] biases           Optional biases tensor info (may be nullptr). When present it must be 1-D and its
+ *                             length must match the channel dimension of @p weights.
+ * @param[in] output           Output tensor info.
+ * @param[in] conv_info        Padding and stride information, forwarded to the kernel validation.
+ * @param[in] depth_multiplier Depth multiplier, forwarded to the kernel validation.
+ *
+ * @return An error status if a required pointer is null, the layout is unsupported, the biases are malformed,
+ *         or NEDepthwiseConvolutionLayer3x3Kernel::validate() fails; that kernel's status otherwise.
+ */
+Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+
+    if(biases != nullptr)
+    {
+        // Depthwise weights keep their channels at the layout's channel index, so the bias length is
+        // compared against that dimension instead of a fixed index
+        const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
+    }
+
+    return NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, output, conv_info, depth_multiplier);
}
void NEDepthwiseConvolutionLayer3x3::run()
@@ -132,32 +183,29 @@
_dwc_kernel.generate_convolver();
}
- // Permute weights in HWIO format if the optimized kernel will be executedd
- if(!_are_weights_reshaped && _is_optimized && _is_nchw)
+ // Permute weights
+ if(_permute)
{
- _are_weights_reshaped = true;
- _permute_weights.run();
+ if(!_are_weights_reshaped)
+ {
+ _are_weights_reshaped = true;
+ _permute_weights.run();
+ }
+
+ _permute_input.run();
}
// Handle input
- if(_is_optimized)
+ if(!_is_optimized)
{
- if(_is_nchw)
- {
- // Permute input to NHWC format execution
- _permute_input.run();
- }
- }
- else
- {
- // Fill border in NCHW format execution
+ // Fill border
NEScheduler::get().schedule(&_border_handler, Window::DimX);
}
// Execute depthwise convolution
NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
- // Permute output to ACL's native NCHW format in case of NHWC execution
+ // Permute output
if(_is_optimized && _is_nchw)
{
_permute_output.run();
@@ -168,27 +216,54 @@
{
NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
}
+
+ // Permute output
+ if(!_is_optimized && !_is_nchw)
+ {
+ _permute_output.run();
+ }
}
+// Default constructor: kernels, permute functions and intermediate tensors are default-constructed;
+// all flags start false and no original weights are referenced yet.
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
- : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
- _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
+ : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
+ _permute_weights(), _permute_output(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false),
+ _is_quantized(false), _is_nhwc(false), _original_weights(nullptr)
{
}
void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_UNUSED(channel_idx);
+
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != weights->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
- const size_t weights_w = weights->info()->dimension(0);
- const size_t weights_h = weights->info()->dimension(1);
- const size_t weights_z = weights->info()->dimension(2);
+ _is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+
+ ITensor *input_to_use = input;
+ const ITensor *weights_to_use = weights;
+ ITensor *output_to_use = output;
+
+ if(_is_nhwc)
+ {
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+ input_to_use = &_permuted_input;
+
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+ weights_to_use = &_permuted_weights;
+ }
+
+ const size_t weights_w = weights_to_use->info()->dimension(0);
+ const size_t weights_h = weights_to_use->info()->dimension(1);
+ const size_t weights_z = weights_to_use->info()->dimension(2);
_is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _is_first_run = true;
- _original_weights = weights;
+ _is_prepared = false;
+ _original_weights = weights_to_use;
// Should bias be appended ?
bool append_bias = (biases != nullptr) && !_is_quantized;
@@ -200,6 +275,14 @@
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ if(_is_nhwc)
+ {
+ permute(output_shape, PermutationVector(1U, 2U, 0U));
+ _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ output_to_use = &_permuted_output;
+ }
+
// Output width and height
const unsigned int conv_w = output_shape.x();
const unsigned int conv_h = output_shape.y();
@@ -209,41 +292,50 @@
const size_t conv_size = conv_w * conv_h;
// Im2Col configuration
- TensorShape shape_im2col = input->info()->tensor_shape();
+ TensorShape shape_im2col = input_to_use->info()->tensor_shape();
shape_im2col.set(0, patch_size);
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
- _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+ _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
+ _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
+ _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
+ _weights_reshape_kernel.configure(weights_to_use, &_weights_reshaped, append_bias ? biases : nullptr);
// GEMV configuration
DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
- TensorShape shape_v2mm_out = input->info()->tensor_shape();
+ TensorShape shape_v2mm_out = input_to_use->info()->tensor_shape();
shape_v2mm_out.set(0, conv_size * weights_z);
shape_v2mm_out.set(1, 1);
shape_v2mm_out.set(2, 1);
- _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
_v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
_output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+ _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output_to_use, conv_w, conv_h);
// Output staged configuration
if(_is_quantized)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+ const QuantizationInfo output_quant_info = output->info()->quantization_info();
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
int output_multiplier, output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+ _output_stage_kernel.configure(&_output_reshaped, biases, output_to_use, output_multiplier, output_shift, output_quant_info.offset);
_output_reshaped.allocator()->allocate();
}
+ if(_is_nhwc)
+ {
+ _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ _permuted_input.allocator()->allocate();
+ _permuted_weights.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+
// Fill borders on inputs
PixelValue zero_in(static_cast<int32_t>(0));
PixelValue zero_w(static_cast<int32_t>(0));
@@ -260,23 +352,102 @@
// Allocate intermediate tensors
_input_reshaped.allocator()->allocate();
- _weights_reshaped.allocator()->allocate();
_v2mm_output.allocator()->allocate();
}
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+
+ // Clone output to use auto init
+ auto output_clone = output->clone();
+
+ const ITensorInfo *input_to_use = input;
+ const ITensorInfo *weights_to_use = weights;
+ const ITensorInfo *output_to_use = output_clone.get();
+
+ TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ TensorInfo permuted_input;
+ TensorInfo permuted_weights;
+
+ if(input->data_layout() == DataLayout::NHWC)
+ {
+ permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+ permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+
+ permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW));
+ permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW));
+
+ input_to_use = &permuted_input;
+ weights_to_use = &permuted_weights;
+ }
+
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool append_bias = (biases != nullptr) && !is_quantized;
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const size_t weights_w = weights_to_use->dimension(0);
+ const size_t weights_h = weights_to_use->dimension(1);
+ const size_t weights_z = weights_to_use->dimension(2);
+ const unsigned int conv_w = output_shape.x();
+ const unsigned int conv_h = output_shape.y();
+ const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
+ const size_t conv_size = conv_w * conv_h;
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output_clone, input->clone()->set_tensor_shape(output_shape));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+
+ TensorInfo permuted_output;
+ if(input->data_layout() == DataLayout::NHWC)
+ {
+ permute(output_shape, PermutationVector(1U, 2U, 0U));
+ permuted_output = TensorInfo(output_clone->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_layout(DataLayout::NCHW));
+ output_to_use = &permuted_output;
+ }
+
+ // Im2Col configuration
+ TensorShape shape_im2col = input_to_use->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+ // Weights reshape configuration
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseWeightsReshapeKernel::validate(weights_to_use, &weights_reshaped, append_bias ? biases : nullptr));
+
+ // GEMV configuration
+ DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+ TensorShape shape_v2mm_out = input_to_use->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+ TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_to_use->tensor_shape()));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output_to_use, conv_w, conv_h));
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output_to_use));
+ }
+
+ return Status{};
+}
+
void NEDepthwiseConvolutionLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
+ prepare();
+
+ if(_is_nhwc)
{
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
- NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
- _is_first_run = false;
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
+ _permute_input.run();
}
NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
@@ -287,4 +458,30 @@
{
NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
}
+
+ if(_is_nhwc)
+ {
+ _permute_output.run();
+ }
+}
+
+void NEDepthwiseConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ if(_is_nhwc)
+ {
+ _permute_weights.run();
+ }
+
+ // Run reshape and mark original weights as unused
+ _weights_reshaped.allocator()->allocate();
+ NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+ _original_weights->mark_as_unused();
+
+ _is_prepared = true;
+ }
}
diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
index d70a668..da2e49c 100644
--- a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,6 +45,14 @@
void NEDepthwiseSeparableConvolutionLayer::run()
{
+ prepare();
+
_depthwise_conv.run();
_pointwise_conv.run();
+}
+
+void NEDepthwiseSeparableConvolutionLayer::prepare()
+{
+ _depthwise_conv.prepare();
+ _pointwise_conv.prepare();
}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 445864c..40e40c8 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -34,7 +34,7 @@
using namespace arm_compute;
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), _is_fixed_point(false),
+ : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
_is_activationlayer_enabled(false), _dim_split(Window::DimZ)
{
}
@@ -54,26 +54,10 @@
// Check if bias should be added in the convolution result
_has_bias = (bias != nullptr);
- // Allocate the intermediate accumulator tensor in case of fixed point input
- _is_fixed_point = is_data_type_fixed_point(input->info()->data_type());
- if(_is_fixed_point)
+ _conv_kernel.configure(input, weights, output, conv_info);
+ if(_has_bias)
{
- const DataType promoted_dt = (input->info()->data_type() == DataType::QS8) ? DataType::QS16 : DataType::QS32;
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, promoted_dt, output->info()->fixed_point_position()));
- _memory_group.manage(&_accumulator);
- _conv_kernel.configure(input, weights, &_accumulator, conv_info);
-
- // When no bias is provided, we need to downscale the accumulator tensor
- _output_stage_kernel.configure(&_accumulator, bias, output);
- _accumulator.allocator()->allocate();
- }
- else
- {
- _conv_kernel.configure(input, weights, output, conv_info);
- if(_has_bias)
- {
- _output_stage_kernel.configure(output, bias);
- }
+ _output_stage_kernel.configure(output, bias);
}
// Add zero padding XY
@@ -92,12 +76,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- DataType data_type = output->data_type();
- if(is_data_type_fixed_point(data_type))
- {
- // Promote data type in case of fixed point
- data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
- }
+ DataType data_type = output->data_type();
TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
// Validate Convolution kernel
@@ -129,7 +108,7 @@
_memory_group.acquire();
NEScheduler::get().schedule(&_conv_kernel, _dim_split);
- if(_has_bias || _is_fixed_point)
+ if(_has_bias)
{
NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 32edf93..1814d61 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -32,6 +32,6 @@
void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
{
auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
- k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, false, true);
+ k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, false, true);
_kernel = std::move(k);
}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 958d081..f1606aa 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include <algorithm>
@@ -35,120 +36,108 @@
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
-NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+namespace
{
-}
-
-void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerReshapeWeights::validate(input->info(), output->info(), transpose_weights, is_batched_fc_layer));
-
- _transpose_weights = transpose_weights;
- _is_batched_fc_layer = is_batched_fc_layer;
-
- // Check if we need to transpose the weights
- if(_transpose_weights)
+ if(is_data_type_quantized_asymmetric(input.data_type()))
{
- if(_is_batched_fc_layer)
- {
- // Initialize the output tensor for transpose
- _transpose_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*input->info())));
- _memory_group.manage(&_transpose_output);
- _transpose_kernel.configure(input, &_transpose_output);
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo input_quantization_info(input.quantization_info().scale, -input.quantization_info().offset);
+ const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, -weights.quantization_info().offset);
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(&_transpose_output, output);
-
- // Allocate temporary tensor used for transposing the weights
- _transpose_output.allocator()->allocate();
- }
- else
- {
- _transpose_kernel.configure(input, output);
- }
+ // Validate gemmlowp function
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info),
+ &output));
}
else
{
- if(_is_batched_fc_layer)
- {
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(input, output);
- }
- }
-}
-
-Status NEFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output, bool transpose_weights, bool is_batched_fc_layer)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!transpose_weights && !is_batched_fc_layer, "Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
-
- if(transpose_weights)
- {
- if(is_batched_fc_layer)
- {
- std::unique_ptr<ITensorInfo> use_output = output->clone();
- use_output->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*input));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NETransposeKernel::validate(input, use_output.get()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(use_output.get(), output));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NETransposeKernel::validate(input, output));
- }
- }
- else
- {
- if(is_batched_fc_layer)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(input, output));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
}
return Status{};
}
+} // namespace
-void NEFullyConnectedLayerReshapeWeights::run()
+void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
{
- _memory_group.acquire();
+ auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
- if(_transpose_weights)
- {
- NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
- }
-
- if(_is_batched_fc_layer)
- {
- NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY);
- }
-
- _memory_group.release();
+Status NEFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NETransposeKernel::validate(input, output);
}
NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
- _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr)
+ : _memory_group(std::move(memory_manager)), _im2col_kernel(), _convert_weights(), _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
+ _im2col_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false),
+ _is_fc_after_conv(false), _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
{
}
-void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
+void NEFullyConnectedLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output)
{
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
+ if(_is_quantized)
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo input_quantization_info = input->info()->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
- // Expected shape before transpose and reshaping
- // Input: In x B (In and B can be multi-dimensional)
- // Weights: flat(In) x Out
- // Biases: Out
- // Output: Out x B (B can be multi-dimensional)
+ input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+ weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+ // Configure gemmlowp function
+ _mm_gemmlowp.configure(input, weights, output);
+
+ // Revert back QuantizationInfo as input and weights could be used in other fully connected layers
+ input->info()->set_quantization_info(input_quantization_info);
+ weights->info()->set_quantization_info(weights_quantization_info);
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
+ }
+}
+
+void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col = compute_flatten_shape(input->info());
+ _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+
+ // Configure im2col kernel
+ _memory_group.manage(&_im2col_output);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true);
+
+ // Configure matrix multiply kernel
+ configure_mm(&_im2col_output, weights, output);
+
+ // Allocate the output tensor for im2col once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure matrix multiply kernel
+ configure_mm(input, weights, output);
+}
+
+void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
+ FullyConnectedLayerInfo fc_info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
// Perform validate step
@@ -156,165 +145,184 @@
weights->info(),
biases != nullptr ? biases->info() : nullptr,
output->info(),
- transpose_weights,
- are_weights_reshaped));
+ fc_info));
- const int num_batch_dimensions = std::max(0, static_cast<int>(output->info()->tensor_shape().num_dimensions()) - 1);
- const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
- const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
+ _are_weights_converted = true;
+ _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ _is_fc_after_conv = true;
+ _accumulate_biases = false;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _original_weights = weights;
- _original_weights = weights;
- _linearize_input = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
- _are_weights_reshaped = are_weights_reshaped;
- _accumulate_biases = biases != nullptr;
- _is_batched_fc_layer = num_batch_dimensions > 0;
-
- const size_t interleave_width = 16 / input->info()->element_size();
- const ITensor *weights_to_use = weights;
-
- if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
+ // Configure gemmlowp output
+ if(_is_quantized)
{
- weights_to_use = &_reshape_weights_output;
-
- _reshape_weights_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_fully_connected_reshaped_weights_shape(weights->info(),
- transpose_weights,
- _is_batched_fc_layer, interleave_width)));
-
- // Reshape the weights
- _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ _gemmlowp_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
}
- const ITensor *multiply_input = input;
-
- if(_linearize_input)
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if(biases != nullptr && !_is_quantized)
{
- _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input->info(), num_input_dimensions)));
+ _accumulate_biases = true;
- // Configure im2col kernel
- _memory_group.manage(&_im2col_output);
- _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true);
-
- multiply_input = &_im2col_output;
- }
-
- int m = multiply_input->info()->dimension(1);
- int k = multiply_input->info()->dimension(0);
-
- if(_is_batched_fc_layer)
- {
- _interleave4x4_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_interleaved_shape(*multiply_input->info())));
-
- // Configure interleave4x4 kernel
- _memory_group.manage(&_interleave4x4_output);
- _interleave4x4_kernel.configure(multiply_input, &_interleave4x4_output);
-
- multiply_input = &_interleave4x4_output;
- }
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(multiply_input, weights_to_use, output, 1.0f, _is_batched_fc_layer, GEMMReshapeInfo(m, 0 /* no transpose */, k));
-
- if(_accumulate_biases)
- {
// Configure accumulate biases kernel
_accumulate_biases_kernel.configure(output, biases);
}
- // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
- if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
- {
- // Allocate the tensor for the weights reshaped
- _reshape_weights_output.allocator()->allocate();
- }
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
- if(_linearize_input)
- {
- _im2col_output.allocator()->allocate();
- }
+ const ITensor *weights_to_use = weights;
- if(_is_batched_fc_layer)
- {
- _interleave4x4_output.allocator()->allocate();
- }
-}
-
-Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output);
-
- const int num_batch_dimensions = std::max(0, static_cast<int>(output->tensor_shape().num_dimensions()) - 1);
- const int num_input_dimensions = input->tensor_shape().num_dimensions() - num_batch_dimensions;
- const size_t linear_input_size = input->tensor_shape().total_size_lower(num_input_dimensions);
-
- const bool linearize_input = (input->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
- const bool accumulate_biases = biases != nullptr;
- const bool is_batched_fc_layer = num_batch_dimensions > 0;
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size_upper(num_input_dimensions) != output->tensor_shape().total_size_upper(1));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
-
- const size_t interleave_width = 16 / input->element_size();
- const ITensorInfo *weights_to_use = weights;
- std::unique_ptr<ITensorInfo> reshape_weights_output = input->clone();
-
- if(!are_weights_reshaped && (transpose_weights || is_batched_fc_layer))
- {
- reshape_weights_output->set_tensor_shape(compute_fully_connected_reshaped_weights_shape(weights, transpose_weights, is_batched_fc_layer, interleave_width));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayerReshapeWeights::validate(weights, reshape_weights_output.get(), transpose_weights, is_batched_fc_layer));
-
- weights_to_use = reshape_weights_output.get();
- }
-
- // Check correct shape of weights
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
if(is_batched_fc_layer)
{
- // Transpose + Transpose1xW
- ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().x() != linear_input_size * interleave_width);
- ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().y() != static_cast<unsigned int>(std::ceil(static_cast<float>(output->tensor_shape().x()) / interleave_width)));
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
}
else
{
- // Transpose
- ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().x() != output->tensor_shape().x());
- ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().y() != linear_input_size);
+ _is_fc_after_conv = input->info()->num_dimensions() > 1;
}
- const ITensorInfo *multiply_input = input;
- std::unique_ptr<ITensorInfo> im2col_output = input->clone();
- std::unique_ptr<ITensorInfo> interleave4x4_output = input->clone();
-
- if(linearize_input)
+ // Reshape weights if needed
+ if(!_are_weights_reshaped)
{
- im2col_output->set_tensor_shape(compute_im2col_fc_shape(input, num_input_dimensions));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, im2col_output.get(), Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true));
-
- multiply_input = im2col_output.get();
+ // Reshape the weights
+ _reshape_weights_function.configure(weights, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
}
- int m = multiply_input->dimension(1);
- int k = multiply_input->dimension(0);
+ // Convert weights if needed
+ if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Convert weights
+ _convert_weights.configure(weights_to_use,
+ &_converted_weights_output,
+ input->info()->tensor_shape(),
+ fc_info.weights_trained_layout);
+
+ weights_to_use = &_converted_weights_output;
+ _are_weights_converted = false;
+ }
+
+ ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(input, weights_to_use, tmp_output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(input, weights_to_use, tmp_output);
+ }
+
+ // Configure output stage for asymmetric quantized types
+ if(_is_quantized)
+ {
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
+ _gemmlowp_output.allocator()->allocate();
+ }
+
+ _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
+
+Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+ bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+ const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)));
+ const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
+ const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+ // Validate accumulate biases kernel for non quantized asymmetric types
+ if(biases != nullptr && !is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *input_to_use = input;
+ const ITensorInfo *weights_to_use = weights;
+ const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->dimension(1) > 1;
if(is_batched_fc_layer)
{
- interleave4x4_output->set_tensor_shape(compute_interleaved_shape(*multiply_input));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(multiply_input, interleave4x4_output.get()));
-
- multiply_input = interleave4x4_output.get();
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3,
+ input->tensor_shape().cend(),
+ output->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = input->num_dimensions() > 1;
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(multiply_input, weights_to_use, output, 1.0f, is_batched_fc_layer, GEMMReshapeInfo(m, 0 /* no transpose */, k)));
-
- if(accumulate_biases)
+ if(!weights_reshaped)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->tensor_shape().x() != output->tensor_shape().x());
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+ if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Validate convert weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(weights_to_use,
+ &converted_weights,
+ input->tensor_shape(),
+ fc_info.weights_trained_layout));
+ weights_to_use = &converted_weights;
+ }
+
+ if(is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+ // Validate im2col kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true));
+ input_to_use = &im2col_input;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+ }
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
+
+ // Validate output stage for asymmetric quantized types
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
}
return Status{};
@@ -322,40 +330,94 @@
void NEFullyConnectedLayer::run()
{
+ // Executes the layer: one-time weight preparation, optional input linearisation,
+ // the matrix multiply, then either the quantized output stage or bias accumulation.
- // Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _are_weights_reshaped = true;
- _reshape_weights_kernel.run();
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ // prepare() guards itself with _is_prepared, so calling it on every run is cheap
+ prepare();
_memory_group.acquire();
// Linearize input if it comes from a convolutional layer
- if(_linearize_input)
+ if(_is_fc_after_conv)
{
NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
}
- // Interleave input
- if(_is_batched_fc_layer)
+ // Run matrix multiply
+ if(_is_quantized)
{
- NEScheduler::get().schedule(&_interleave4x4_kernel, Window::DimY);
+ _mm_gemmlowp.run();
+ }
+ else
+ {
+ _mm_gemm.run();
}
- // Run matrix multiply
- NEScheduler::get().schedule(&_mm_kernel, _is_batched_fc_layer ? Window::DimY : Window::DimX);
-
// Accumulate biases if provided
- if(_accumulate_biases)
+ // Quantized path runs the GEMMLowp output stage instead of the bias accumulation kernel
+ // (validate() hands the biases to that stage)
+ if(_is_quantized)
{
- NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+ _gemmlowp_output_stage.run();
+ }
+ else
+ {
+ if(_accumulate_biases)
+ {
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+ }
}
_memory_group.release();
}
+
+/** One-shot preparation of the weights.
+ *
+ * Reshapes and/or converts the weights the first time it is called, lets the
+ * non-quantized GEMM prepare itself, and frees every intermediate weights
+ * tensor that is no longer flagged as used. Later calls return immediately.
+ */
+void NEFullyConnectedLayer::prepare()
+{
+    // Everything below must happen exactly once
+    if(_is_prepared)
+    {
+        return;
+    }
+
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    // Frees the backing memory of an intermediate tensor that is no longer marked as used
+    auto free_if_unused = [](Tensor * tensor)
+    {
+        if(tensor->is_used())
+        {
+            return;
+        }
+        tensor->allocator()->free();
+    };
+
+    // Most recently produced weights tensor; starts at the user-provided weights
+    const ITensor *latest_weights = _original_weights;
+
+    // Weights reshape (first call only)
+    if(!_are_weights_reshaped)
+    {
+        _reshape_weights_output.allocator()->allocate();
+        _reshape_weights_function.run();
+
+        // The tensor that fed the reshape is not needed any more
+        latest_weights->mark_as_unused();
+        latest_weights        = &_reshape_weights_output;
+        _are_weights_reshaped = true;
+    }
+
+    // Weights conversion (first call only)
+    if(!_are_weights_converted)
+    {
+        _converted_weights_output.allocator()->allocate();
+        _convert_weights.run();
+
+        // Whatever fed the conversion is not needed any more
+        latest_weights->mark_as_unused();
+        _are_weights_converted = true;
+    }
+
+    // First release pass: the reshaped weights may already be unused at this point
+    free_if_unused(&_reshape_weights_output);
+
+    // Only the non-quantized GEMM is prepared here
+    if(!_is_quantized)
+    {
+        _mm_gemm.prepare();
+    }
+
+    // Second release pass, re-checking both intermediates after the GEMM preparation
+    // NOTE(review): presumably the GEMM prepare step can mark its weights input as unused - confirm
+    free_if_unused(&_reshape_weights_output);
+    free_if_unused(&_converted_weights_output);
+
+    _is_prepared = true;
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 9168ed4..de51266 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -23,72 +23,56 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/AssemblyHelper.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"
#include <cmath>
+using namespace arm_compute::misc::shape_calculator;
+
namespace arm_compute
{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(),
- _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+ // memory_manager is copied rather than moved because both _memory_group and _asm_glue are constructed from it
+ : _memory_group(memory_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr),
+ _run_vector_matrix_multiplication(false), _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
{
}
void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
- if(c != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
- ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
- ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
- }
+ ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
// Check if we need to reshape the matrix B only on the first run
+ _is_prepared = false;
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+ _original_b = b;
- const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)
- && setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue);
+ bool run_optimised = c == nullptr && bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), d->info(), alpha, beta, _reshape_b_only_on_first_run));
- // Check if the first input tensor is a vector.
- // If so, all the kernels for reshaping the tensors can be skipped
- if(_run_vector_matrix_multiplication)
+ if(run_optimised)
{
- if(!run_optimised)
+ _asm_glue.configure(a, b, d, alpha, beta, _reshape_b_only_on_first_run);
+ ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
+ }
+ else
+ {
+ if(_run_vector_matrix_multiplication)
{
// Configure the matrix multiply kernel
_mm_kernel.configure(a, b, d, alpha, false);
}
-
- // Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
- {
- _ma_kernel.configure(c, d, beta);
- _run_addition = true;
- }
- }
- else
- {
- if(!run_optimised)
+ else
{
TensorShape shape_tmp_a = a->info()->tensor_shape();
TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -100,8 +84,8 @@
shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
@@ -128,42 +112,135 @@
// Allocate once the all configure methods have been called
_tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
-
- // Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
+ if(!_reshape_b_only_on_first_run)
{
- _ma_kernel.configure(c, d, beta);
- _run_addition = true;
+ _tmp_b.allocator()->allocate();
}
}
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
+ }
}
}
+/** Static validation of a GEMM configuration.
+ *
+ * Checks data types and matrix dimensions of @p a, @p b, the optional @p c and @p output,
+ * then, when the assembly dispatch is not selected, validates the interleave / transpose /
+ * matrix-multiply kernels that the non-assembly path uses.
+ *
+ * @param[in] a         First input tensor info.
+ * @param[in] b         Second input tensor info.
+ * @param[in] c         Optional third input tensor info; may be nullptr.
+ * @param[in] output    Output tensor info; dimension checks are skipped when its total size is 0.
+ * @param[in] alpha     Forwarded to the assembly dispatch and matrix-multiply kernel validation.
+ * @param[in] beta      Forwarded to the assembly dispatch validation.
+ * @param[in] gemm_info Reshape / 3D reinterpretation settings.
+ *
+ * @return An empty Status on success, otherwise the first failing check.
+ */
+Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ // NOTE(review): alpha is actually forwarded further down, so this marker looks like a leftover
+ ARM_COMPUTE_UNUSED(alpha);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ // A C matrix is only accepted for plain 2D GEMM (no 3D reinterpretation of input or output)
+ if(c != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ }
+
+ // Output shape checks apply only when the output info has already been initialised
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+ if(gemm_info.depth_output_gemm3d() != 1)
+ {
+ if(gemm_info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+ }
+ else
+ {
+ // Rows of A are spread over dimensions 1 and 2 of the 3D output
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ }
+ }
+
+ // Check if we need to run the optimized assembly kernel
+ // The pretranspose hint is hard-coded to true here; the dispatch validate in this patch ignores it
+ const bool run_optimised = c == nullptr && bool(NEGEMMAssemblyDispatch::validate(a, b, output, alpha, beta, true));
+
+ if(!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMM cannot reinterpret the output tensor as 3D");
+
+ // Check if the first input tensor is a vector.
+ const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+ // Check if we need to reshape the matrix A and matrix B
+ const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run());
+
+ // Arguments used by GEMMReshapeInfo
+ // If we pass the matrix A and matrix B reshaped to NEGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to NEGEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ const int m = a->dimension(1);
+ const int n = b->dimension(0);
+ const int k = a->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
+
+ // Default to the raw inputs; redirected to the reshaped temporaries below when needed
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ // Work on a copy so that auto-initialisation below never touches the caller's output info
+ TensorInfo tmp_output_info = *output->clone();
+
+ if(run_interleave_transpose)
+ {
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
+ }
+
+ // Validate matrix multiply
+ auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+ }
+
+ return Status{};
+}
+
void NEGEMM::run()
{
- _memory_group.acquire();
+ prepare();
- if(_asm_glue._optimised_kernel != nullptr)
+ if(_asm_glue.is_configured())
{
+ _memory_group.acquire();
_asm_glue.run();
_memory_group.release();
}
else
{
+ _memory_group.acquire();
+
if(!_run_vector_matrix_multiplication)
{
// Run interleave kernel
NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
- if(_is_first_run)
- {
- // Run transpose kernel
- NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
-
- _is_first_run = false;
- }
- else if(!_reshape_b_only_on_first_run)
+ if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
@@ -181,4 +258,27 @@
}
}
}
+
+/** One-shot preparation of matrix B.
+ *
+ * When the assembly dispatch is configured, its own prepare() is invoked.
+ * Otherwise, if B only has to be reshaped once and A is not a vector, the
+ * transposed copy of B is allocated and computed here and the original B is
+ * marked as unused. Later calls return immediately.
+ */
+void NEGEMM::prepare()
+{
+    if(_is_prepared)
+    {
+        return;
+    }
+
+    if(_asm_glue.is_configured())
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+        _asm_glue.prepare();
+    }
+    // Reaching this branch already implies the assembly dispatch is not configured,
+    // so that condition is not tested a second time
+    else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+        // _tmp_b is allocated here, immediately before the one-off transpose that fills it
+        _tmp_b.allocator()->allocate();
+        NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+        _original_b->mark_as_unused();
+    }
+
+    _is_prepared = true;
+}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
new file mode 100644
index 0000000..29db654
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
+#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+/** Type-agnostic factory for ACL-side GEMM functions.
+ *
+ * Only the interleaved method is handled, and only when a pre-transpose of B
+ * was requested; every other combination yields nullptr.
+ *
+ * @return The configured function, or nullptr when no function exists for @p method.
+ */
+std::unique_ptr<IFunction> create_function_all_types(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+                                                     std::shared_ptr<IMemoryManager> memory_manager)
+{
+    // Note: no FP16 support check is needed here, NEGEMMAssemblyDispatch::configure() has already done it
+    const bool is_interleaved = (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED);
+    if(!is_interleaved || !pretranspose_hint)
+    {
+        return nullptr;
+    }
+
+    auto interleaved_gemm = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
+    interleaved_gemm->configure(a, b, d, alpha, beta, pretranspose_hint);
+    // Explicit move: the return converts unique_ptr<NEGEMMInterleavedWrapper> to unique_ptr<IFunction>
+    return std::move(interleaved_gemm);
+}
+
+// Primary template of the type-specific factory: no ACL function exists for a generic
+// <TypeInput, TypeOutput> pair, so every argument is ignored and nullptr is returned.
+template <typename TypeInput, typename TypeOutput>
+std::unique_ptr<IFunction> create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+ std::shared_ptr<IMemoryManager> memory_manager)
+{
+ ARM_COMPUTE_UNUSED(method);
+ ARM_COMPUTE_UNUSED(a);
+ ARM_COMPUTE_UNUSED(b);
+ ARM_COMPUTE_UNUSED(d);
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_UNUSED(pretranspose_hint);
+ ARM_COMPUTE_UNUSED(memory_manager);
+ // nullptr signals to the caller that no ACL function is available for this type pair
+ return nullptr;
+}
+
+#ifdef __aarch64__
+/** int8 -> int32 factory: only the dot-product interleaved method with a pre-transposed B is supported. */
+template <>
+std::unique_ptr<IFunction> create_function<int8_t, int32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+                                                            std::shared_ptr<IMemoryManager> memory_manager)
+{
+    const bool is_interleaved_dot = (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT);
+    if(!is_interleaved_dot || !pretranspose_hint)
+    {
+        return nullptr;
+    }
+
+    auto dot_gemm = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
+    dot_gemm->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
+    // Explicit move: the return converts to unique_ptr<IFunction>
+    return std::move(dot_gemm);
+}
+
+/** uint8 -> uint32 factory: only the dot-product interleaved method with a pre-transposed B is supported. */
+template <>
+std::unique_ptr<IFunction> create_function<uint8_t, uint32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+                                                              std::shared_ptr<IMemoryManager> memory_manager)
+{
+    const bool is_interleaved_dot = (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT);
+    if(!is_interleaved_dot || !pretranspose_hint)
+    {
+        return nullptr;
+    }
+
+    auto dot_gemm = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
+    dot_gemm->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
+    // Explicit move: the return converts to unique_ptr<IFunction>
+    return std::move(dot_gemm);
+}
+
+/** float -> float factory: only the native GEMM method is supported; it is wrapped in a simple assembly function. */
+template <>
+std::unique_ptr<IFunction> create_function<float, float>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+                                                         std::shared_ptr<IMemoryManager> memory_manager)
+{
+    ARM_COMPUTE_UNUSED(pretranspose_hint);
+    ARM_COMPUTE_UNUSED(memory_manager);
+
+    if(method != arm_gemm::GemmMethod::GEMM_NATIVE)
+    {
+        return nullptr;
+    }
+
+    // Configure the kernel first, then hand its ownership over to the wrapping function
+    auto native_kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
+    native_kernel->configure(a, b, d, alpha, beta);
+
+    auto wrapper = support::cpp14::make_unique<NESimpleAssemblyFunction>();
+    wrapper->configure(std::move(native_kernel));
+    // Explicit move: the return converts to unique_ptr<IFunction>
+    return std::move(wrapper);
+}
+#endif /* __aarch64__ */
+
+/** Fallback in case ACL doesn't have a function: drives an arm_gemm kernel directly */
+template <typename TypeInput, typename TypeOutput>
+class Fallback : public NEGEMMAssemblyDispatch::IFallback
+{
+public:
+ /** Create the arm_gemm kernel for @p args and store the tensors it will operate on.
+ *
+ * If arm_gemm returns no kernel the object is left unconfigured (see is_configured()).
+ *
+ * @param[in] a Input tensor A.
+ * @param[in] b Input tensor B.
+ * @param[out] d Output tensor.
+ * @param[in] args arm_gemm arguments used to select the kernel.
+ * @param[in] memory_group Memory group that manages the workspace tensor.
+ */
+ void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group);
+ /** Set up strides and pointers, then schedule the wrapped kernel. Calls prepare() itself. */
+ void run() override;
+ /** Pre-transpose B once if the kernel requires it; later calls do nothing. */
+ void prepare() override;
+ /** @return true once a wrapped kernel has been created by configure(). */
+ bool is_configured() const override;
+
+private:
+ /** Allocate a workspace tensor.
+ *
+ * @param[in] workspace_size Size to allocate.
+ * @param[in] memory_group Tensor memory group.
+ * @param[in] alignment Workspace memory alignment.
+ */
+ void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment);
+
+ /** Assembly Gemm kernel */
+ std::unique_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+ /** Optimised NEON kernel */
+ std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
+ /** Input A */
+ const ITensor *_a
+ {
+ nullptr
+ };
+ /** Input B */
+ const ITensor *_b
+ {
+ nullptr
+ };
+ /** Output */
+ ITensor *_d{ nullptr };
+ /** GEMM workspace */
+ Tensor _workspace{};
+ /** Pre-transpose tensor */
+ Tensor _pretranspose{};
+ /** Prepared flag */
+ bool _is_prepared{ false };
+};
+
+// Creates the arm_gemm kernel for args, wraps it in an ACL kernel, allocates the workspace
+// (managed by memory_group) and, if required, the pre-transpose buffer for B.
+// Leaves the object unconfigured when arm_gemm has no kernel for this configuration.
+template <typename TypeInput, typename TypeOutput>
+void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group)
+{
+ _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args, nullptr);
+ if(_gemm_kernel_asm == nullptr)
+ {
+ //configuration not supported: Leave function unconfigured:
+ return;
+ }
+
+ // arm_compute wrapper for the Gemm object (see above)
+ std::unique_ptr<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>> acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>>();
+ ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
+ acl_gemm_wrapper->configure(_gemm_kernel_asm.get());
+ const size_t workspace_size = _gemm_kernel_asm->get_working_size();
+ if(workspace_size > 0)
+ {
+ // Allocate workspace
+ const unsigned int alignment = 4096;
+ allocate_workspace(workspace_size, memory_group, alignment);
+ }
+
+ //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
+ //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
+ {
+ // Cap the kernel's thread count at its window size when the window is smaller than the thread budget
+ const int window_size = _gemm_kernel_asm->get_window_size();
+ if(window_size < args._maxthreads)
+ {
+ _gemm_kernel_asm->set_nthreads(window_size);
+ }
+ }
+
+ // From here on is_configured() reports true
+ _optimised_kernel = std::move(acl_gemm_wrapper);
+ _a = a;
+ _b = b;
+ _d = d;
+ // Check for pre-transposed support
+ if(_gemm_kernel_asm->B_pretranspose_required())
+ {
+ // Forcing 128-byte alignment (required by 32-bit kernels)
+ const unsigned int alignment = 128;
+ const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
+ // The buffer is over-sized by `alignment` elements and, unlike the workspace, is not handed to memory_group
+ _pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment);
+ _pretranspose.allocator()->allocate();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_pretranspose.buffer());
+ }
+}
+
+/** Pre-transposes B into the internal buffer the first time it is called (when the kernel
+ *  requires it) and marks the original B as unused. Later calls return immediately.
+ */
+template <typename TypeInput, typename TypeOutput>
+void Fallback<TypeInput, TypeOutput>::prepare()
+{
+    if(_is_prepared)
+    {
+        return;
+    }
+
+    // Pretranspose B if required
+    if(_gemm_kernel_asm->B_pretranspose_required())
+    {
+        ARM_COMPUTE_ERROR_ON(_pretranspose.buffer() == nullptr);
+
+        // Byte strides are converted to element strides of the input type
+        const int  row_stride_b   = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+        const auto b_data         = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
+        const int  multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+
+        _gemm_kernel_asm->pretranspose_B_array(_pretranspose.buffer(), b_data, row_stride_b, multi_stride_b);
+        _b->mark_as_unused();
+    }
+
+    _is_prepared = true;
+}
+
+/** Initialises, registers with @p memory_group and allocates the workspace tensor.
+ *  The tensor is sized as workspace_size + alignment elements of S8.
+ */
+template <typename TypeInput, typename TypeOutput>
+void Fallback<TypeInput, TypeOutput>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
+
+    const TensorInfo workspace_info(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8);
+    _workspace.allocator()->init(workspace_info, alignment);
+
+    // The tensor must be handed to the memory group before it is allocated
+    memory_group.manage(&_workspace);
+    _workspace.allocator()->allocate();
+}
+
+/** @return true when configure() managed to create a wrapped kernel. */
+template <typename TypeInput, typename TypeOutput>
+bool Fallback<TypeInput, TypeOutput>::is_configured() const
+{
+    const bool has_kernel = (_optimised_kernel != nullptr);
+    return has_kernel;
+}
+
+// Computes element strides and base pointers for A, B and D, (re)binds the workspace,
+// makes sure B has been prepared, then schedules the wrapped assembly kernel.
+template <typename TypeInput, typename TypeOutput>
+void Fallback<TypeInput, TypeOutput>::run()
+{
+ // Row strides are expressed in elements, hence the division by the element size
+ const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ int ldb = 0;
+ const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
+
+ // In the case of NHWC we want to interpret the output shape as 3D. Thus, the batch stride for A is
+ // the relevant multiple of the row stride.
+ const bool is_nhwc = _a->info()->data_layout() == DataLayout::NHWC;
+ const int stride_in_bytes_a = is_nhwc ? _a->info()->strides_in_bytes().y() * _d->info()->dimension(1) : _a->info()->strides_in_bytes().z();
+
+ const int batch_stride_a = stride_in_bytes_a / sizeof(TypeInput);
+ const int batch_stride_d = _d->info()->strides_in_bytes().z() / sizeof(TypeOutput);
+
+ const int multi_stride_a = _a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
+ int multi_stride_b = 0;
+ const int multi_stride_d = _d->info()->strides_in_bytes()[3] / sizeof(TypeOutput);
+
+ const auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
+ const TypeInput *in1_ptr = nullptr;
+ auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());
+
+ // Check if B is pre-transposed and de-reference if not
+ // (when it is, ldb / multi_stride_b stay 0 and in1_ptr stays null)
+ if(!_gemm_kernel_asm->B_is_pretransposed())
+ {
+ ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+ in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
+ }
+
+ // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
+ if(_workspace.buffer() != nullptr)
+ {
+ _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
+ const unsigned int window_size = _gemm_kernel_asm->get_window_size();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ _gemm_kernel_asm->set_nthreads(num_threads);
+ }
+ }
+
+ // Prepare assembly kernel
+ // (does nothing after the first call, guarded by _is_prepared)
+ prepare();
+
+ // Set gemm parameters
+ _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);
+
+ // Schedule assembly kernel
+ NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
+}
+
+/** Fills in either @p acl_function or @p arm_gemm for the given tensors.
+ *
+ * The type-agnostic ACL factory is tried first, then the type-specific one; if neither
+ * produces a function, an arm_gemm based Fallback is configured and stored in @p arm_gemm.
+ */
+template <typename TypeInput, typename TypeOutput>
+void create_function_or_arm_gemm(std::unique_ptr<IFunction> &acl_function, std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group, const ITensor *a, const ITensor *b,
+                                 ITensor *d, float alpha, float beta, bool pretranspose_hint, std::shared_ptr<IMemoryManager> memory_manager)
+{
+    INEGEMMWrapperKernel::Params gemm_params = INEGEMMWrapperKernel::extract_parameters(a, b, d);
+    const CPUInfo               &cpu_info    = NEScheduler::get().cpu_info();
+    unsigned int                 max_threads = NEScheduler::get().num_threads();
+
+    arm_gemm::GemmArgs<TypeOutput> args(&cpu_info, gemm_params.M, gemm_params.N, gemm_params.K, gemm_params.batches, gemm_params.multis, false, false, alpha, beta, max_threads, pretranspose_hint);
+
+    // First choice: the type-agnostic ACL factory
+    acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
+    if(acl_function != nullptr)
+    {
+        return;
+    }
+
+    // Second choice: the factory specialised for this type pair
+    acl_function = create_function<TypeInput, TypeOutput>(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
+    if(acl_function != nullptr)
+    {
+        return;
+    }
+
+    // Last resort: drive arm_gemm directly because ACL doesn't support this method
+    auto arm_gemm_fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>();
+    arm_gemm_fallback->configure(a, b, d, args, memory_group);
+    arm_gemm = std::move(arm_gemm_fallback);
+}
+
+} //namespace
+
+// Starts unconfigured (no ACL function, no arm_gemm fallback). The memory manager is used
+// for the internal memory group and also kept so configure() can pass it to the factories.
+NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager)
+ : _function(nullptr), _arm_gemm(nullptr), _memory_group(memory_manager), _memory_manager(memory_manager)
+{
+}
+
+// Checks that the data types of a, b and d form a combination the assembly dispatch accepts.
+// alpha, beta and pretranspose_hint are part of the signature but are not inspected.
+// Returns an empty Status on success, otherwise the first failing check.
+Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, float alpha, float beta, bool pretranspose_hint)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_UNUSED(pretranspose_hint);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
+#ifndef __aarch64__
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 || a->data_type() == DataType::S8 || a->data_type() == DataType::QASYMM8, "8bit integer types only supported for aarch64");
+#endif /* __aarch64__ */
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::U8, DataType::QASYMM8, DataType::S8, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ // Each input type is tied to specific output type(s)
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::S32 && d->data_type() != DataType::U32, "Only U32/S32 output supported for QASYMM8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
+ return Status{};
+}
+
+/** Selects and configures the implementation matching the data type of @p a.
+ *
+ * Unsupported type combinations are not an error: the function returns silently and
+ * the caller is expected to query is_configured() afterwards.
+ */
+void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+
+    // Unsupported combination of data types: leave the object unconfigured
+    const Status status = NEGEMMAssemblyDispatch::validate(a->info(), b->info(), d->info(), alpha, beta, pretranspose_hint);
+    if(!bool(status))
+    {
+        return;
+    }
+
+    const DataType input_type = a->info()->data_type();
+    switch(input_type)
+    {
+        case DataType::F32:
+            create_function_or_arm_gemm<float, float>(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint, _memory_manager);
+            break;
+#ifdef __aarch64__
+        case DataType::U8:
+        case DataType::QASYMM8:
+            create_function_or_arm_gemm<uint8_t, uint32_t>(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint, _memory_manager);
+            break;
+        case DataType::S8:
+            create_function_or_arm_gemm<int8_t, int32_t>(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint, _memory_manager);
+            break;
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            create_function_or_arm_gemm<float16_t, float16_t>(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint, _memory_manager);
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            // Nothing is created for any other type: the object stays unconfigured
+            break;
+    }
+}
+
+/** Forwards prepare() to the ACL function when one exists, otherwise to the arm_gemm fallback. */
+void NEGEMMAssemblyDispatch::prepare()
+{
+    if(_function == nullptr)
+    {
+        // No ACL function: the arm_gemm fallback must have been created
+        ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+        _arm_gemm->prepare();
+        return;
+    }
+
+    _function->prepare();
+}
+
+/** @return true when either a configured arm_gemm fallback or an ACL function is available. */
+bool NEGEMMAssemblyDispatch::is_configured() const
+{
+    const bool fallback_ready = (_arm_gemm != nullptr) && _arm_gemm->is_configured();
+    return fallback_ready || (_function != nullptr);
+}
+
+/** Runs the selected implementation with the internal memory group acquired for the duration of the call. */
+void NEGEMMAssemblyDispatch::run()
+{
+    _memory_group.acquire();
+
+    if(_function == nullptr)
+    {
+        // No ACL function: the arm_gemm fallback must have been created
+        ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+        _arm_gemm->run();
+    }
+    else
+    {
+        _function->run();
+    }
+
+    _memory_group.release();
+}
+} //namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 2888b43..92e641e 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -23,10 +23,10 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
@@ -34,98 +34,50 @@
#include <cmath>
#include <tuple>
-namespace
-{
-arm_compute::TensorShape get_reshaped_weights_shape(const arm_compute::ITensorInfo *weights, bool append_bias)
-{
- const unsigned int mat_weights_cols = weights->dimension(3);
- const unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
- return arm_compute::TensorShape(mat_weights_cols, mat_weights_rows);
-}
-} // namespace
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
-namespace arm_compute
-{
-NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+// Default constructor: the weights reshape kernel is the only member initialised here.
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel()
{
}
-void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
+// Validates the arguments, then configures the weights reshape kernel to write into output.
+// biases may be nullptr; weights and output must not be.
+void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
(biases != nullptr) ? biases->info() : nullptr,
- output->info(),
- transpose1xW));
+ output->info()));
- // Check if bias are present, if yes they will be embedded to the weights matrix
- const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
- //const unsigned bias_element = (append_biases) ? 1 : 0;
+ // Biases are handed to the reshape kernel only when present and the weights are not asymmetric-quantized
+ const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
const ITensor *biases_to_use = (append_biases) ? biases : nullptr;
- _transpose1xW = transpose1xW;
-
- if(transpose1xW)
- {
- // Create tensor to store the reshaped weights
- TensorInfo info_wr = weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(get_reshaped_weights_shape(weights->info(), append_biases));
-
- _weights_reshaped.allocator()->init(info_wr);
- _memory_group.manage(&_weights_reshaped);
-
- _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
- _weights_transposed_kernel.configure(&_weights_reshaped, output);
-
- _weights_reshaped.allocator()->allocate();
- }
- else
- {
- _weights_reshape_kernel.configure(weights, biases_to_use, output);
- }
+ _weights_reshape_kernel.configure(weights, biases_to_use, output);
+ // Copy the weights' quantization info onto the reshaped output
output->info()->set_quantization_info(weights->info()->quantization_info());
}
-Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose1xW)
+// Static validation of the reshape-weights arguments.
+// weights must be non-null, single-channel QASYMM8/F16/F32 with at most 4 dimensions.
+// biases (optional) must match the weights' data type, be 1D, and have one entry per kernel.
+// output (optional) is only checked when it has already been initialised (total_size() != 0).
+// Returns an empty Status on success, or the first error encountered.
+Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- if(!is_data_type_quantized_asymmetric(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
- }
- // Check if bias are present, if yes they will be embedded to the weights matrix
- const bool append_bias = (biases != nullptr);
- if(append_bias)
+ if(biases != nullptr)
{
+ const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
}
- // Checks performed when biases are present
- if(append_bias)
+ if((output != nullptr) && (output->total_size() != 0))
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
- if(transpose1xW)
- {
- TensorInfo weights_reshaped = weights->clone()->set_tensor_shape(get_reshaped_weights_shape(weights, append_bias));
- ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(&weights_reshaped, output));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, output));
+ // Propagate the kernel's verdict: dropping the returned Status would let an invalid configuration pass
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, output));
}
return Status{};
@@ -133,110 +85,21 @@
void NEConvolutionLayerReshapeWeights::run()
{
- _memory_group.acquire();
-
+ // Schedule the weights reshape kernel; 3 is passed as the dimension argument of schedule()
NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
-
- if(_transpose1xW)
- {
- NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
- }
-
- _memory_group.release();
}
-namespace
-{
-TensorShape get_reshaped_weights_shape_conv(const ITensorInfo *weights, bool append_bias, bool is_fully_connected_convolution)
-{
- unsigned int mat_weights_cols = weights->dimension(3);
- unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
-
- if(is_fully_connected_convolution)
- {
- // Create tensor to store the reshaped weights
- return TensorShape(mat_weights_cols, mat_weights_rows);
- }
- else
- {
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / weights->element_size();
- return TensorShape(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
- }
-}
-
-Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const ActivationLayerInfo &act_info, DataType &dt,
- bool &append_bias, bool &skip_im2col,
- bool &are_weights_reshaped, unsigned int &kernel_width, unsigned int &kernel_height,
- bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized, bool &is_activationlayer_enabled,
- unsigned int &mat_weights_cols, unsigned int &mat_weights_rows,
- unsigned int &conv_w, unsigned int &conv_h, const Size2D &dilation)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
-
- DataLayout data_layout = input->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(idx_channel) != input->dimension(idx_channel));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON(weights_info.are_reshaped() && is_data_type_quantized_asymmetric(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32, "NHWC is only supported for FP32 data type.");
-
- dt = input->data_type();
- is_quantized = is_data_type_quantized_asymmetric(dt);
-
- if(biases != nullptr)
- {
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && biases->dimension(0) != weights->dimension(3));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- // If we have 1x1 convolution and data layout is NHWC we can disable im2col
- append_bias = (biases != nullptr) && (!is_quantized);
- are_weights_reshaped = weights_info.are_reshaped();
- kernel_width = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(idx_width);
- kernel_height = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(idx_height);
- mat_weights_cols = weights->dimension(3);
- mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + ((append_bias && !skip_im2col) ? 1 : 0);
- skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1);
-
- std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), input->dimension(idx_height), kernel_width, kernel_height,
- conv_info, dilation);
-
- // Check if its a "fully connected" convolution
- is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
- is_interleaved = (!is_fully_connected_convolution && !is_quantized);
- is_activationlayer_enabled = act_info.enabled();
-
- return Status{};
-}
-} // namespace
-
+// Constructor: memory_manager is passed to the memory group and to the GEMMLowp function; the remaining
+// members are default-initialised, the layout starts as NCHW and every boolean flag starts as false.
NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
- _output_col2im_kernel(), _activationlayer_function(), _add_bias_kernel(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(),
- _tmp_output(), _workspace(), _B_pretransposed(), _data_layout(DataLayout::NCHW), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false),
- _is_interleaved(false), _is_activationlayer_enabled(false), _skip_im2col(false)
+ : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
+ _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
+ _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
-void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
+// Configures the matrix-multiply stage of the convolution after validating it with validate_mm().
+// input, weights and output must all be non-null; gemm_3d_depth is forwarded to the GEMMInfo.
+// The quantized branch is used when _is_quantized is set, otherwise NEGEMM is configured.
+void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, int gemm_3d_depth)
{
+ // output is dereferenced on the next line, so it has to be part of the null check as well
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
+
if(_is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -255,128 +118,145 @@
}
else
{
- _mm_kernel.configure(input, weights, output, 1.f, is_interleaved, reshape_info);
+ // Configure matrix multiply function
+ _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, gemm_3d_depth,
+ _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */));
}
}
-void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info)
+// Validates the matrix-multiply stage for the given tensor infos.
+// Asymmetric-quantized input is checked against NEGEMMLowpMatrixMultiplyCore using clones whose
+// quantization offsets are negated; any other input is checked against NEGEMM with alpha 1 and beta 0.
+// gemm_3d_depth and skip_im2col are forwarded through the GEMMInfo. Returns the callee's Status.
+Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth, bool skip_im2col)
{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- DataType dt{};
- unsigned int kernel_width = 0;
- unsigned int kernel_height = 0;
- unsigned int mat_weights_cols = 0;
- unsigned int mat_weights_rows = 0;
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
-
- _data_layout = input->info()->data_layout();
- const bool is_nhwc = _data_layout == DataLayout::NHWC;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-
- Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, act_info, dt, _append_bias, _skip_im2col,
- _are_weights_reshaped,
- kernel_width, kernel_height,
- _is_fully_connected_convolution, _is_interleaved, _is_quantized, _is_activationlayer_enabled,
- mat_weights_cols, mat_weights_rows, conv_w, conv_h, dilation);
-
- ARM_COMPUTE_ERROR_THROW_ON(status);
-
- _original_weights = weights;
- const unsigned int fixed_point_position = input->info()->fixed_point_position();
- const ITensor *biases_to_use = (_append_bias) ? biases : nullptr;
-
- bool run_optimised = dt == DataType::F32;
-
- // Reshape weights if needed
- if(run_optimised)
+ const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col);
+ if(is_quantized)
{
- TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo input_quantization_info = input->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->quantization_info();
- // Create tensor to store the reshaped weights
- _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
- _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
- weights = &_weights_reshaped;
+ // Work on clones so the caller's tensor infos keep their original quantization info
+ std::unique_ptr<ITensorInfo> input_qa = input->clone();
+ std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+ input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+ weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+ // Perform validation step on GEMMLowp
+ return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), output, gemm_info);
}
else
{
- if(_are_weights_reshaped)
- {
- if(_is_fully_connected_convolution || _is_quantized)
- {
- mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->info()->dimension(idx_height);
- }
- else
- {
- mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(idx_channel) + (_append_bias ? 1 : 0);
- }
- }
- else
- {
- TensorShape reshaped_weights_shape;
+ // Perform validation step on Matrix multiply function
+ return NEGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+ }
+}
- if(_is_fully_connected_convolution || _is_quantized)
- {
- reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
- }
- else
- {
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->info()->element_size();
- reshaped_weights_shape = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
- static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
- }
+// Checks whether the matrix-multiply stage accepts a 3D GEMM of the given depth for data_type.
+// The check runs validate_mm() on small dummy tensor infos, so no real tensors are needed.
+// For asymmetric-quantized data types the dummy output uses S32. Returns validate_mm()'s Status.
+Status NEGEMMConvolutionLayer::validate_gemm3d(DataType data_type, int gemm_3d_depth, bool skip_im2col)
+{
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const DataType output_gemm_data_type = is_quantized ? DataType::S32 : data_type;
+ // gemm_3d_depth scales the dummy input's Y dimension, or its Z dimension when im2col is skipped
+ const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
+ const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
- // Create tensor to store the reshaped weights
- _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
- _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, _is_interleaved /* 1xW transpose */);
- weights = &_weights_reshaped;
+ // Set dummy tensor shapes for the validation
+ const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type);
+ const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type);
+ const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, output_gemm_data_type);
+
+ return validate_mm(&dummy_input_info, &dummy_weights_info, &dummy_output_info, gemm_3d_depth, skip_im2col);
+}
+
+// Configures the GEMM-based convolution: validates the arguments, reshapes the weights, optionally
+// sets up im2col on the input, configures the matrix multiply, the quantized output stage,
+// col2im or a reshape layer on the output, and finally the optional activation.
+// input, weights and output must be non-null; biases may be nullptr.
+void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ // NOTE(review): num_groups is forwarded to validate() just below, so this UNUSED marker looks redundant
+ ARM_COMPUTE_UNUSED(num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(),
+ weights->info(),
+ biases != nullptr ? biases->info() : nullptr,
+ output->info(),
+ conv_info,
+ weights_info,
+ dilation,
+ act_info,
+ num_groups));
+
+ const DataType data_type = input->info()->data_type();
+ const DataLayout data_layout = input->info()->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->info()->dimension(idx_width);
+ const unsigned int kernel_height = weights->info()->dimension(idx_height);
+
+ // _is_prepared is seeded from the retain_internal_weights flag of weights_info
+ _is_prepared = weights_info.retain_internal_weights();
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _data_layout = data_layout;
+ // im2col is skipped only for NHWC 1x1 kernels with unit stride; col2im is initially skipped for any NHWC input
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ _skip_col2im = data_layout == DataLayout::NHWC;
+ _append_bias = (biases != nullptr) && (!_is_quantized);
+
+ // These pointers start at the user tensors and are redirected to intermediates as stages are added
+ const ITensor *gemm_input_to_use = input;
+ ITensor *gemm_output_to_use = output;
+ ITensor *gemm_output_staged_to_use = output;
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width),
+ input->info()->dimension(idx_height),
+ kernel_width,
+ kernel_height,
+ conv_info,
+ dilation);
+
+ // Check if GEMM3D is supported
+ if(_skip_col2im)
+ {
+ // If not supported, we need to perform im2col and col2im (or reshape layer)
+ if(!bool(validate_gemm3d(input->info()->data_type(), conv_h, _skip_im2col)))
+ {
+ _skip_im2col = false;
+ _skip_col2im = false;
}
}
- // In case we skip im2col we have to add bias
+ // Biases go through the weights reshape (one extra weights row) only when im2col actually runs
+ const unsigned bias_element = (_append_bias && !_skip_im2col) ? 1 : 0;
+ const ITensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
+
+ // Get parameters from conv_info
+ // NOTE(review): stride_x and stride_y are not read again in this function
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+
+ unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels);
+ unsigned int mat_weights_rows = weights->info()->dimension(idx_width) * weights->info()->dimension(idx_height) * weights->info()->dimension(idx_channel) + bias_element;
+
+ // _weights_reshaped will be auto configured in the kernel.
+ // Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM
+ _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
+
+ // Create tensor to store im2col reshaped inputs
if(!_skip_im2col)
{
- const unsigned int mat_input_cols = mat_weights_rows;
- const unsigned int mat_input_rows = conv_w * conv_h;
-
- // Create tensor to store im2col reshaped inputs
- TensorShape shape_im2col(input->info()->tensor_shape());
- shape_im2col.set(0, mat_input_cols);
- shape_im2col.set(1, mat_input_rows);
+ // Calculate im2col shape
+ // For NEON the batch size is on the fourth dimension
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_weights_rows);
+ shape_im2col.set(1, conv_w * conv_h);
shape_im2col.set(2, 1);
- _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _memory_group.manage(&_input_im2col_reshaped);
- // Create tensor (interleave) to prepare input tensor for GEMM
- if(!_is_fully_connected_convolution && !run_optimised && _is_interleaved)
- {
- TensorShape shape_interleaved(shape_im2col);
- shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
- shape_interleaved.set(idx_height, std::ceil(shape_interleaved[idx_height] / 4.f));
- _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
- _memory_group.manage(&_input_interleaved_reshaped);
- }
+ _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ _memory_group.manage(&_im2col_output);
- // Create GEMM output tensor
- TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, mat_input_rows);
- const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
- // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
- TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
- info_gemm.set_quantization_info(output->info()->quantization_info());
- _gemm_output.allocator()->init(info_gemm);
+ // Configure
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation);
- // Configure im2col
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation);
+ // Update GEMM input
+ gemm_input_to_use = &_im2col_output;
}
else if(_append_bias)
{
@@ -384,129 +264,187 @@
_add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
}
- // Configure matrix multiply
- if(run_optimised)
+ // Create temporary GEMM output tensor in case we cannot skip col2im
+ if(!_skip_col2im)
{
- if(!setup_assembly_kernel(_skip_im2col ? input : &_input_im2col_reshaped, weights, is_nhwc ? output : &_gemm_output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue))
- {
- ARM_COMPUTE_ERROR("setup_assembly_kernel failed.");
- }
- }
- else
- {
- if(_is_interleaved)
- {
- // Configure GEMMInterleave4x4. _input_interleaved_reshaped will be auto configured in the kernel
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ // Calculate GEMM output shape
+ TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
- // Configure GEMM
- configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(idx_height), 0 /* no transpose */,
- _input_im2col_reshaped.info()->dimension(idx_width)));
- _input_interleaved_reshaped.allocator()->allocate();
- }
- else
- {
- configure_mm(&_input_im2col_reshaped, weights, &_gemm_output, _is_interleaved);
- }
+ // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+ const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
+ TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
+ info_gemm.set_quantization_info(output->info()->quantization_info());
+ _gemm_output.allocator()->init(info_gemm);
+ _memory_group.manage(&_gemm_output);
+
+ // Update GEMM output
+ gemm_output_to_use = &_gemm_output;
}
+ // Configure GEMM
+ configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, _skip_col2im ? conv_h : 1);
+
if(!_skip_im2col)
{
- _input_im2col_reshaped.allocator()->allocate();
+ _im2col_output.allocator()->allocate();
+ }
- // Configure output stage for quantized case
- if(_is_quantized)
+ // Configure output stage for quantized case
+ if(_is_quantized)
+ {
+ // Fall back to the input's quantization info while the output info is still empty
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+ _memory_group.manage(&_tmp_output);
+ gemm_output_staged_to_use = &_tmp_output;
+
+ _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset);
+ }
+
+ if(!_skip_col2im)
+ {
+ if(_data_layout == DataLayout::NCHW)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
-
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _memory_group.manage(&_tmp_output);
- _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+ // Configure col2im
+ _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, Size2D(conv_w, conv_h));
}
-
- // Configure Col2Im
- if(!is_nhwc)
+ else
{
- _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
+ // Configure reshape layer
+ _reshape_layer.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output);
}
+ }
- if(_is_quantized)
- {
- _tmp_output.allocator()->allocate();
- }
+ if(_is_quantized)
+ {
+ _tmp_output.allocator()->allocate();
+ }
+
+ if(!_skip_col2im)
+ {
_gemm_output.allocator()->allocate();
}
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
-
- // Allocate intermediate tensor
- if(!_are_weights_reshaped)
- {
- _weights_reshaped.allocator()->allocate();
- }
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
+ "Output shape does not match the expected one");
//Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
if(_is_activationlayer_enabled)
{
_activationlayer_function.configure(output, nullptr, act_info);
}
+
+ // NOTE(review): weights_info is read earlier in this function, so this UNUSED marker looks redundant
+ ARM_COMPUTE_UNUSED(weights_info);
}
Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{
- ARM_COMPUTE_UNUSED(output);
-
- DataType dt{};
- bool append_bias{};
- bool skip_im2col{};
- bool are_weights_reshaped{};
- bool is_fully_connected_convolution{};
- bool is_interleaved{};
- bool is_quantized{};
- bool is_activationlayer_enabled{};
- unsigned int kernel_width = 0;
- unsigned int kernel_height = 0;
- unsigned int mat_weights_cols = 0;
- unsigned int mat_weights_rows = 0;
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON");
const DataLayout data_layout = input->data_layout();
- const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const DataType data_type = input->data_type();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
- Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, act_info, dt, append_bias, skip_im2col, are_weights_reshaped, kernel_width, kernel_height,
- is_fully_connected_convolution, is_interleaved, is_quantized, is_activationlayer_enabled, mat_weights_cols, mat_weights_rows,
- conv_w, conv_h, dilation);
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
- const Size2D kernel_weights = Size2D(kernel_width, kernel_height);
+ TensorInfo im2col_reshaped_info, info_gemm, tmp_info, weights_reshaped_info;
+ const ITensorInfo *gemm_input_to_use = input;
+ const ITensorInfo *gemm_output_to_use = output;
+ const ITensorInfo *gemm_output_staged_to_use = output;
+ const ITensorInfo *weights_to_use = weights;
- ARM_COMPUTE_RETURN_ON_ERROR(status);
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool append_bias = (biases != nullptr) && (!is_quantized);
+ bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ bool skip_col2im = data_layout == DataLayout::NHWC;
- std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone();
- bool optimised_kernel = false;
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
- if(dt == DataType::F32)
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width),
+ input->dimension(idx_height),
+ kernel_width,
+ kernel_height,
+ conv_info,
+ dilation);
+
+ // Check if GEMM3D is supported
+ if(skip_col2im)
{
- optimised_kernel = true;
+ // If not supported, we need to perform im2col and col2im (or reshape layer)
+ if(!bool(validate_gemm3d(input->data_type(), conv_h, skip_im2col)))
+ {
+ skip_im2col = false;
+ skip_col2im = false;
+ }
}
- const unsigned int mat_input_cols = mat_weights_rows;
- const unsigned int mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->tensor_shape();
- shape_im2col.set(0, mat_input_cols);
- shape_im2col.set(1, mat_input_rows);
- shape_im2col.set(2, 1);
- TensorInfo im2_col_info = input->clone()->set_tensor_shape(shape_im2col);
+ const unsigned bias_element = (append_bias && !skip_im2col) ? 1 : 0;
+ const ITensorInfo *biases_to_use = (append_bias && !skip_im2col) ? biases : nullptr;
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ // Validate biases
+ if(biases != nullptr)
+ {
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
+ }
+
+ unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+ unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + bias_element;
+
+ // Validate the weights reshaping and build the tensor info of the reshaped weights
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases_to_use, nullptr));
+ weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, (append_bias && !skip_im2col)), 1, data_type);
+ weights_to_use = &weights_reshaped_info;
if(!skip_im2col)
{
- // Validate im2col
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false, false, dilation));
+ // Create tensor info for im2col reshaped inputs
+ // For NEON the batch size is on the fourth dimension
+ TensorShape shape_im2col = input->tensor_shape();
+ shape_im2col.set(0, mat_weights_rows);
+ shape_im2col.set(1, conv_w * conv_h);
+ shape_im2col.set(2, 1);
+
+ im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
+ im2col_reshaped_info.set_quantization_info(input->quantization_info());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
+ gemm_input_to_use = &im2col_reshaped_info;
}
else if(append_bias)
{
@@ -514,66 +452,45 @@
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
}
- // Create GEMM output tensor
- TensorShape shape_gemm(im2_col_info.tensor_shape());
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, mat_input_rows);
- TensorInfo gemm_output_info = input->clone()->set_tensor_shape(shape_gemm);
-
- // Reshape weights if needed
- if(optimised_kernel)
+ // Create temporary GEMM output tensor in case we cannot skip col2im
+ if(!skip_col2im)
{
- ARM_COMPUTE_RETURN_ERROR_ON(are_weights_reshaped);
+ TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+ const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
+ // For quantized asymmetric input the GEMM output must be S32, so that the raw integer accumulators are obtained before the quantized output stage.
+ info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
+ info_gemm.set_quantization_info(output->quantization_info());
- // Create tensor to store the reshaped weights
- reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
- }
- else if(!is_quantized)
- {
- TensorShape reshaped_weights_shape;
-
- if(is_fully_connected_convolution || is_quantized)
- {
- reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
- }
- else
- {
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->element_size();
- reshaped_weights_shape = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
- static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
- }
-
- // Create tensor to store the reshaped weights
- reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
- weights = reshaped_weights.get();
-
- // Validate GEMM interleave and multiply
- if(is_interleaved)
- {
- TensorShape shape_interleaved = shape_im2col;
- shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
- shape_interleaved.set(idx_height, std::ceil(shape_interleaved.y() / 4.f));
- TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo(shape_im2col[1], // m
- weights->tensor_shape()[0], // n
- shape_im2col[0]) /* k */));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
- }
- }
- if(!is_nhwc)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+ gemm_output_to_use = &info_gemm;
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(idx_width) != conv_w) || (output->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 1, skip_im2col));
+ if(is_quantized)
+ {
+ float multiplier = input->quantization_info().scale * weights_to_use->quantization_info().scale / output->quantization_info().scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+ tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
+ tmp_info.set_quantization_info(output->quantization_info());
+ gemm_output_staged_to_use = &tmp_info;
+
+ // Validate output stage for quantized case
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, output->quantization_info().offset);
+ }
+
+ // Validate Col2Im/ReshapeLayer
+ if(!skip_col2im && (data_layout == DataLayout::NCHW))
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use,
+ output,
+ Size2D(conv_w, conv_h)));
+ }
+
+ //Validate Activation Layer
if(act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
@@ -584,54 +501,30 @@
void NEGEMMConvolutionLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(!_are_weights_reshaped)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _are_weights_reshaped = true;
- _reshape_weights.run();
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
_memory_group.acquire();
if(!_skip_im2col)
{
// Run input reshaping
- unsigned int _y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- NEScheduler::get().schedule(&_input_im2col_kernel, _y_dim);
+ unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ NEScheduler::get().schedule(&_im2col_kernel, y_dim);
}
- // Runs matrix multiply on reshaped matrices
- if(_asm_glue._optimised_kernel != nullptr)
+ // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions
+ if(_is_quantized)
{
- _asm_glue.run();
- // Release weights in case buffer is pretransposed
- if(!_weights_reshaped.is_used())
- {
- _weights_reshaped.allocator()->free();
- }
+ // Run gemmlowp
+ _mm_gemmlowp.run();
+
+ // Run output stage
+ _gemmlowp_output_stage.run();
}
else
{
- if(_is_interleaved)
- {
- // Run interleave
- NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
- }
-
- // Runs matrix multiply on reshaped matrices
- if(_is_quantized)
- {
- _mm_gemmlowp.run();
- }
- else
- {
- NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
- }
+ // Run gemm
+ _mm_gemm.run();
}
if(_skip_im2col && _append_bias)
@@ -639,16 +532,17 @@
NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
}
- // Run output stage for quantized case
- if(_is_quantized)
- {
- _gemmlowp_output_stage.run();
- }
-
// Reshape output matrix
- if(_data_layout == DataLayout::NCHW)
+ if(!_skip_col2im)
{
- NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+ if(_data_layout == DataLayout::NCHW)
+ {
+ NEScheduler::get().schedule(&_col2im_kernel, Window::DimY);
+ }
+ else
+ {
+ _reshape_layer.run();
+ }
}
if(_is_activationlayer_enabled)
@@ -658,4 +552,25 @@
_memory_group.release();
}
-} // namespace arm_compute
+
+void NEGEMMConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ _reshape_weights.run();
+ _original_weights->mark_as_unused();
+
+ // Prepare GEMM
+ _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
+ if(!_weights_reshaped.is_used())
+ {
+ _weights_reshaped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 98b4767..47c3358 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -38,8 +38,7 @@
using namespace arm_compute;
NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(),
- _workspace(), _B_pretransposed()
+ : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b()
{
}
@@ -53,18 +52,14 @@
ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
bool run_optimised = false;
-#ifdef __aarch64__
switch(a->info()->data_type())
{
case DataType::S8:
- {
- run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_signed);
- break;
- }
case DataType::QASYMM8:
case DataType::U8:
{
- run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_unsigned);
+ _asm_glue.configure(a, b, output, 1.f, 0.f, true);
+ run_optimised = _asm_glue.is_configured();
break;
}
default:
@@ -73,7 +68,6 @@
break;
}
}
-#endif /* __aarch64__ */
if(!run_optimised)
{
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
@@ -133,13 +127,9 @@
NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
- if(_asm_glue_unsigned._optimised_kernel != nullptr)
+ if(_asm_glue.is_configured())
{
- _asm_glue_unsigned.run();
- }
- else if(_asm_glue_signed._optimised_kernel != nullptr)
- {
- _asm_glue_signed.run();
+ _asm_glue.run();
}
else
{
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 2e06fa2..828011d 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -41,9 +41,9 @@
using namespace arm_compute::misc::shape_calculator;
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
- _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0),
- _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+ : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
+ _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false),
+ _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
{
}
@@ -52,23 +52,27 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+ // Clear state
+ _mtx_a_reshape_kernel = nullptr;
+ _mtx_b_reshape_kernel = nullptr;
+
+ // Set internal variables
_a_offset = a->info()->quantization_info().offset;
_b_offset = b->info()->quantization_info().offset;
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _is_prepared = false;
+ _original_b = b;
#ifdef __aarch64__
switch(a->info()->data_type())
{
- case DataType::S8:
- {
- _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
- break;
- }
case DataType::QASYMM8:
case DataType::U8:
+ case DataType::S8:
{
- _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
+ _asm_glue.configure(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run);
+ _dot_product_path = _asm_glue.is_configured();
break;
}
default:
@@ -160,10 +164,13 @@
if(!_dot_product_path && !_run_vector_matrix_multiplication)
{
_tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
}
- if(_a_offset != 0)
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
{
_vector_sum_col.allocator()->allocate();
}
@@ -188,6 +195,8 @@
ARM_COMPUTE_UNUSED(gemm_info);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");
int32_t a_offset = a->quantization_info().offset;
int32_t b_offset = b->quantization_info().offset;
@@ -248,29 +257,24 @@
void NEGEMMLowpMatrixMultiplyCore::run()
{
+ prepare();
+
_memory_group.acquire();
- // Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction
- if(!_run_vector_matrix_multiplication && !_dot_product_path)
+ // Reshape inputs
+ if(_mtx_a_reshape_kernel)
{
- if(_mtx_a_reshape_kernel)
- {
- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
- }
-
- if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run))
- {
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- }
+ NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+ }
+ if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
+ {
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
- if(_asm_glue_unsigned._optimised_kernel != nullptr)
+ // Run GEMM
+ if(_asm_glue.is_configured())
{
- _asm_glue_unsigned.run();
- }
- else if(_asm_glue_signed._optimised_kernel != nullptr)
- {
- _asm_glue_signed.run();
+ _asm_glue.run();
}
else
{
@@ -284,7 +288,7 @@
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run))
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
{
NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
}
@@ -293,6 +297,38 @@
NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
_memory_group.release();
+}
- _is_first_run = false;
+void NEGEMMLowpMatrixMultiplyCore::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Run assembly reshape
+ if(_asm_glue.is_configured() && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ _asm_glue.prepare();
+ _original_b->mark_as_unused();
+ }
+ // Run non-assembly reshape
+ else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ // Run reshape kernel and mark original weights tensor as unused
+ _tmp_b.allocator()->allocate();
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+ _original_b->mark_as_unused();
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ _vector_sum_col.allocator()->allocate();
+ NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ }
+
+ _is_prepared = true;
+ }
}
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index 6b95cb0..4245b65 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -34,16 +34,18 @@
{
}
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups,
+ bool is_fully_connected, bool is_flatten)
{
_y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- _kernel.configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+ _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
}
-Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+ unsigned int num_groups, bool is_fully_connected, bool is_flatten)
{
- return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+ return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
}
void NEIm2Col::run()
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 913acf8..80a2541 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@
NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
- _is_first_run(false), _original_weights(nullptr)
+ _is_prepared(false), _original_weights(nullptr)
{
}
@@ -113,7 +113,7 @@
TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
@@ -127,7 +127,7 @@
ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
bool _has_bias = (biases != nullptr);
- _is_first_run = true;
+ _is_prepared = false;
_original_weights = weights;
const unsigned int kernel_width = weights->info()->dimension(0);
@@ -160,24 +160,13 @@
_output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
// Allocate intermediate tensors
- _weights_reshaped.allocator()->allocate();
_input_im2col_reshaped.allocator()->allocate();
_gemm_output.allocator()->allocate();
}
void NELocallyConnectedLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _is_first_run = false;
- NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
_memory_group.acquire();
@@ -192,3 +181,18 @@
_memory_group.release();
}
+
+void NELocallyConnectedLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+ _original_weights->mark_as_unused();
+
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
index f865054..2738201 100644
--- a/src/runtime/NEON/functions/NEMagnitude.cpp
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,36 +31,18 @@
using namespace arm_compute;
-void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type, bool use_fp16)
+void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type)
{
- if(use_fp16)
+ if(mag_type == MagnitudeType::L1NORM)
{
- if(mag_type == MagnitudeType::L1NORM)
- {
- auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
- k->configure(input1, input2, output, nullptr);
- _kernel = std::move(k);
- }
- else
- {
- auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
- k->configure(input1, input2, output, nullptr);
- _kernel = std::move(k);
- }
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, output, nullptr);
+ _kernel = std::move(k);
}
else
{
- if(mag_type == MagnitudeType::L1NORM)
- {
- auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
- k->configure(input1, input2, output, nullptr);
- _kernel = std::move(k);
- }
- else
- {
- auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
- k->configure(input1, input2, output, nullptr);
- _kernel = std::move(k);
- }
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, output, nullptr);
+ _kernel = std::move(k);
}
}
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index af98ac1..f00114f 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type());
_input_squared.allocator()->init(tensor_info);
// Manage intermediate buffers
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
new file mode 100644
index 0000000..995d5ee
--- /dev/null
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NERNNLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+ _is_prepared(false)
+{
+}
+
+Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
+ const ITensorInfo *output, const ActivationLayerInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+
+ const int idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != recurrent_weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != recurrent_weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
+
+ auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+ return Status{};
+}
+
+void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output,
+ ActivationLayerInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+
+ const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+ _is_prepared = false;
+
+ // Initialize the tensor infos of the intermediate buffers
+ _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+ // Manage intermediate buffers and configure
+ _memory_group.manage(&_fully_connected_out);
+ _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+
+ _memory_group.manage(&_gemm_output);
+ _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+ _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_add_output);
+
+ _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+
+ _fully_connected_out.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+
+ _activation_kernel.configure(&_add_output, hidden_state, info);
+ _add_output.allocator()->allocate();
+
+ _copy_kernel.configure(hidden_state, output);
+}
+
+void NERNNLayer::run()
+{
+ prepare();
+
+ _memory_group.acquire();
+
+ _fully_connected_kernel.run();
+
+ _gemm_state_f.run();
+
+ NEScheduler::get().schedule(&_add_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_activation_kernel, Window::DimY);
+
+ // Copy the updated hidden state to the output
+ NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+
+ _memory_group.release();
+}
+
+void NERNNLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ _fully_connected_kernel.prepare();
+ _gemm_state_f.prepare();
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
new file mode 100644
index 0000000..a4b0dff
--- /dev/null
+++ b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NESimpleAssemblyFunction::NESimpleAssemblyFunction() // NOLINT
+    : _kernel(nullptr) // No kernel is owned until configure() is called
+{
+}
+
+void NESimpleAssemblyFunction::run()
+{
+    NEScheduler::get().schedule(_kernel.get(), Window::DimX); // NOTE(review): _kernel is only set by configure(); there is no guard if run() is called first
+}
+
+void NESimpleAssemblyFunction::configure(std::unique_ptr<INEGEMMWrapperKernel> kernel)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(kernel.get());
+    _kernel = std::move(kernel); // Take ownership; 'kernel' is moved-from past this point, so use _kernel below
+    ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(_kernel->window(), 1); // run() schedules with Window::DimX
+}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 4fb8300..3a73f1e 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -62,6 +62,7 @@
{
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
const TensorShape max_shape = TensorShape(input->tensor_shape()).set(0, 1);
const TensorInfo tensor_info_max_sum = TensorInfo(*input).set_tensor_shape(max_shape).reset_padding();
diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp
index 889d827..105646c 100644
--- a/src/runtime/NEON/functions/NEWarpAffine.cpp
+++ b/src/runtime/NEON/functions/NEWarpAffine.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,11 +32,10 @@
using namespace arm_compute;
-void NEWarpAffine::configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void NEWarpAffine::configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(nullptr == matrix);
switch(policy)
{
diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp
index ed5d6a0..80b97ce 100644
--- a/src/runtime/NEON/functions/NEWarpPerspective.cpp
+++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,11 +32,10 @@
using namespace arm_compute;
-void NEWarpPerspective::configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void NEWarpPerspective::configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON(nullptr == matrix);
switch(policy)
{
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
new file mode 100644
index 0000000..097605c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWidthConcatenateLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+NEWidthConcatenateLayer::NEWidthConcatenateLayer()
+    : _concat_kernels_vector(), // The kernel array is created in configure()
+      _num_inputs(0)            // Set from the size of the input vector in configure()
+{
+}
+
+Status NEWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+    // Reject null inputs up front, before any entry of inputs_vector is dereferenced below
+    for(const auto &input : inputs_vector)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    }
+    // Output auto initialization if not yet initialized, done on a temporary copy of the output info
+    TensorInfo tmp_output_info = *output->clone();
+    auto_init_if_empty(tmp_output_info, arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector), 1, inputs_vector[0]->data_type());
+    unsigned int width_offset = 0; // Running offset along dimension 0, advanced by each input's dimension(0)
+    for(const auto &input : inputs_vector)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+        width_offset += input->dimension(0);
+    }
+    return Status{};
+}
+
+void NEWidthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); // inputs_vector[0] is read below; validate() also requires >= 2 inputs
+    _num_inputs = inputs_vector.size();
+    // Collect the tensor infos that validate() expects, checking each input before it is dereferenced
+    std::vector<ITensorInfo *> inputs_vector_info;
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
+        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+    }
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(NEWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
+    // One kernel per input, each configured with its running offset along dimension 0 of the output
+    unsigned int width_offset = 0;
+    _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEWidthConcatenateLayerKernel[]>(_num_inputs);
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+        width_offset += inputs_vector.at(i)->info()->dimension(0);
+    }
+}
+
+void NEWidthConcatenateLayer::run()
+{
+    for(unsigned int idx = 0; idx < _num_inputs; ++idx)
+    {
+        NEScheduler::get().schedule(&_concat_kernels_vector[idx], Window::DimY); // One kernel per input, in input order
+    }
+}
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 8f2c4c4..828a593 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -24,16 +24,15 @@
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "support/ToolchainSupport.h"
-#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
-
#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
namespace arm_compute
@@ -60,7 +59,6 @@
ARM_COMPUTE_UNUSED(output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); // COMPMID-1162
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(height_idx) != 5, "Only 3 and 5 kernels are supported");
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -107,12 +105,13 @@
return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
}
+
} //namespace
NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
- _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
- _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
+ : _memory_group(memory_manager), _asm_glue(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
+ _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
+ _is_prepared(false), _is_activationlayer_enabled(false)
{
} /* arm_compute */
@@ -138,9 +137,10 @@
ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
}
- _weights = weights;
- _input = input;
- _output = output;
+ _weights = weights;
+ _input = input;
+ _output = output;
+ _is_prepared = false;
std::unique_ptr<INEWinogradLayerTransformInputKernel<float>> transform_input_kernel;
std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
@@ -155,29 +155,32 @@
{
if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
{
- transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>>();
- transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>>();
- transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>>();
- n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradBase::N_GEMMS;
- N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradConv::N_BLOCK;
+ using config = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
}
else
{
- transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
- transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
- transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
- n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
- N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
+ using config = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
}
break;
}
case 5:
{
- transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
- transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
- transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
- n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
- N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
+ using config = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>;
+ transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
break;
}
default:
@@ -195,96 +198,138 @@
const int out_channels = output->info()->dimension(channel_idx);
const Tensor4DShape in_shape(internal_get_input_shape(input));
+ const DataType data_type = input->info()->data_type();
const size_t data_type_size = input->info()->element_size();
// Get the memory required to instantiate a new Winograd operator.
- constexpr size_t storage_alignment = 64;
- const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
- _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
- _kernel_storage.allocator()->allocate();
+ constexpr size_t storage_alignment = 64;
+
+ // Kernel Storage
+ const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
+ in_channels)
+ * data_type_size
+ + storage_alignment - 1;
+
// Input storage
- const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
- _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
- _input_workspace.allocator()->allocate();
+ const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
+ use_same_padding)
+ * data_type_size
+ + storage_alignment - 1;
// Output storage
- const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
- _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
- _output_workspace.allocator()->allocate();
+ const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
+ use_same_padding)
+ * data_type_size
+ + storage_alignment - 1;
+ ;
+ const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
+ const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
+
+ const int output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+ const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
+
+ const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+
+ // Configure GEMM
+ const int tile_rows = iceildiv(output_shape.n_rows, output_tile.height);
+ const int tile_cols = iceildiv(output_shape.n_cols, output_tile.width);
+ const int m = in_shape.n_batches * tile_rows * tile_cols;
+ const int k = in_shape.n_channels;
+ const int n = out_channels;
+ const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
+ const int output_matrix_row_stride = kernel_matrix_row_stride;
+
+ TensorShape a_shape(k, m, 1, n_gemms);
+ Strides a_strides(data_type_size);
+ a_strides.set(1, a_strides[0] * k);
+ a_strides.set(2, 0);
+ a_strides.set(3, data_type_size * input_matrix_stride);
+
+ TensorShape b_shape(n, k, n_gemms);
+ Strides b_strides(data_type_size);
+ b_strides.set(1, data_type_size * kernel_matrix_row_stride);
+ b_strides.set(2, data_type_size * kernel_matrix_stride);
+
+ TensorShape d_shape(n, m, 1, n_gemms);
+ Strides d_strides(data_type_size);
+ d_strides.set(1, data_type_size * output_matrix_row_stride);
+ d_strides.set(2, 0);
+ d_strides.set(3, data_type_size * output_matrix_stride);
+
+ TensorInfo a_info, b_info, d_info;
+ a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
+ b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
+ d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
+
+ _input_workspace.allocator()->init(a_info, storage_alignment);
+ _kernel_storage.allocator()->init(b_info, storage_alignment);
+ _output_workspace.allocator()->init(d_info, storage_alignment);
// configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
_output->info()->dimension(1), _output->info()->dimension(3)),
1, _output->info()->data_type());
_output_nhwc.allocator()->init(info);
- _output_nhwc.allocator()->allocate();
-
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
- _weights_hwio.allocator()->allocate();
-
- // configure the kernel to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- _input_nhwc.allocator()->allocate();
-
- const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
// Configure the InputTransform
- const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
- transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
+ _memory_group.manage(&_input_workspace);
+ if(data_layout == DataLayout::NCHW)
+ {
+ // configure the kernel to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+ _input_nhwc.allocator()->allocate();
+ transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+ &_input_workspace, input_matrix_stride);
+ }
+ else
+ {
+ transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+ &_input_workspace, input_matrix_stride);
+ }
// Configure WeightsTransform
- const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
- transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
+ if(data_layout == DataLayout::NCHW)
+ {
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
+
+ transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+ }
+ else
+ {
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
+
+ transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+ }
+ _weights_hwio.allocator()->allocate();
// Configure OutputTransform
//The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
- const int output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
- const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
- transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
- output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
- in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
-
- // Configure GEMM
- const int tile_rows = iceildiv(output_shape.n_rows, output_tile.height);
- const int tile_cols = iceildiv(output_shape.n_cols, output_tile.width);
- const int m = in_shape.n_batches * tile_rows * tile_cols;
- const int k = in_shape.n_channels;
- const int n = out_channels;
- const int input_matrix_row_stride = in_shape.n_channels;
- const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
- const int output_matrix_row_stride = kernel_matrix_row_stride;
- unsigned int num_threads = NEScheduler::get().num_threads();
-
- _arm_gemm = arm_gemm::gemm<float, float>(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false);
- _arm_gemm->set_arrays(reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast<float *>(_kernel_storage.buffer()),
- kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast<float *>(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride);
-
- auto acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>>>();
- acl_gemm_wrapper->configure(_arm_gemm.get());
- const size_t workspace_size = _arm_gemm->get_working_size();
-
- // Allocate workspace
- if(workspace_size > 0)
+ _memory_group.manage(&_output_workspace);
+ if(data_layout == DataLayout::NCHW)
{
- const unsigned int alignment = 4096;
- allocate_workspace(workspace_size, _workspace, &_memory_group, alignment, 1);
- _arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
+ transform_output_kernel->configure(biases, &_output_workspace,
+ output_matrix_stride, &_output_nhwc,
+ in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+ }
+ else
+ {
+ transform_output_kernel->configure(biases, &_output_workspace,
+ output_matrix_stride, _output,
+ in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
}
- const unsigned int window_size = _arm_gemm->get_window_size();
- if(window_size < num_threads)
- {
- num_threads = window_size;
- _arm_gemm->set_nthreads(num_threads);
- }
-
- _gemm_kernel = std::move(acl_gemm_wrapper);
+ _asm_glue.configure(&_input_workspace, &_kernel_storage, &_output_workspace, 1.0f, 0.f, false);
+ _input_workspace.allocator()->allocate();
+ _kernel_storage.allocator()->allocate();
+ _output_workspace.allocator()->allocate();
// Reorder the convoluted output to ACL's ordering NCHW
_permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+ _output_nhwc.allocator()->allocate();
+
_transform_input_kernel = std::move(transform_input_kernel);
_transform_weights_kernel = std::move(transform_weights_kernel);
_transform_output_kernel = std::move(transform_output_kernel);
@@ -293,38 +338,43 @@
_is_activationlayer_enabled = act_info.enabled();
if(_is_activationlayer_enabled)
{
- _activationlayer_function.configure(output, nullptr, act_info);
+ _activationlayer_function.configure(_output, nullptr, act_info);
}
}
void NEWinogradConvolutionLayer::run()
{
- _memory_group.acquire();
- if(!_reshaped_kernel)
- {
- _reshaped_kernel = true;
- _permute_weights.run();
- NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
- }
- //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- _permute_input.run();
+ const DataLayout data_layout = _input->info()->data_layout();
+ prepare();
+
+ _memory_group.acquire();
+
+ if(data_layout == DataLayout::NCHW)
+ {
+ //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
+ _permute_input.run();
+ }
// Transform input tensor to the winograd domain
NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
//Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
- NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX);
+ _asm_glue.run();
// Transform output tensor to the spatial domain
NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
- // Reorder the convoluted output to ACL's ordering NCHW
- _permute_output.run();
+ if(data_layout == DataLayout::NCHW)
+ {
+ // Reorder the convoluted output to ACL's ordering NCHW
+ _permute_output.run();
+ }
if(_is_activationlayer_enabled)
{
_activationlayer_function.run();
}
+
_memory_group.release();
}
@@ -358,6 +408,7 @@
// Validate input transform
const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
+
switch(weights->dimension(idx_width))
{
case 3:
@@ -444,7 +495,6 @@
break;
}
}
-
// Validate Activation Layer
if(act_info.enabled())
{
@@ -453,4 +503,20 @@
return Status{};
}
+void NEWinogradConvolutionLayer::prepare()
+{
+    if(_is_prepared)
+    {
+        return;
+    }
+    // Permute the weights, then flag the original weights tensor as unused
+    _permute_weights.run();
+    _weights->mark_as_unused();
+    // Transform the permuted weights, then free the permuted copy
+    NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
+    _weights_hwio.allocator()->free();
+
+    _is_prepared = true;
+}
+
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
new file mode 100644
index 0000000..b52ce66
--- /dev/null
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)) // the by-value shared_ptr is moved into the memory group
+{
+}
+void NEGEMMInterleavedWrapper::run()
+{
+    prepare(); // does its work on the first call only (guarded by _is_prepared)
+    // Execute the functors built by prepare() between acquiring and releasing the memory group
+    _memory_group.acquire();
+    NEScheduler::get().run_workloads(_workloads);
+    _memory_group.release();
+}
+
+void NEGEMMInterleavedWrapper::prepare()
+{
+    // One-off setup (guarded by _is_prepared): prepares B, creates the kernels' workloads and builds the functors that run() executes
+    if(_is_prepared)
+    {
+        return;
+    }
+    if(_pretranspose_b)
+    {
+        NEScheduler::get().schedule(_prepare_b.get(), Window::DimX);
+        _b->mark_as_unused();
+    }
+    else
+    {
+        // NOTE(review): _b_workloads is filled here but is not consumed by the functors built below - confirm where B gets transformed
+        _prepare_b->create_workloads(_b_workloads);
+    }
+    _transform_a->create_workloads(_a_workloads);
+    _matrix_multiply->create_workloads(_mm_workloads);
+
+    // Create at most one functor per scheduler thread, and never more than the batch window has iterations:
+    const unsigned int max_iterations = NEScheduler::get().num_threads();
+    const unsigned int num_iterations = _batch_window.num_iterations_total();
+    const unsigned int num_windows = std::min(num_iterations, max_iterations);
+    const TensorShape window_shape = _batch_window.shape();
+    // The number of x blocks along N does not depend on the sub-window, so it is computed once:
+    const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
+
+    // Create a 1D window to dynamically split the batch window:
+    Window win_1D;
+    win_1D.set(0, Window::Dimension(0, num_iterations));
+
+    // Create one workload for each sub-window:
+    for(unsigned int w = 0; w < num_windows; ++w)
+    {
+        // First and last (inclusive) positions of sub-window w, expressed as batch-window coordinates:
+        Window win = win_1D.split_window(0, w, num_windows);
+        const Coordinates start_offset = index2coords(window_shape, win.x().start());
+        const Coordinates end_offset = index2coords(window_shape, win.x().end() - 1);
+        auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+        {
+            // The matrix multiply workloads are consumed in order, num_x_blocks of them per A workload
+            auto workload_mm = this->_mm_workloads.begin();
+            for(auto &workload_a : this->_a_workloads)
+            {
+                // Transform one k_block from A:
+                this->_transform_a->transform(workload_a, info, this->_batch_window, start_offset, end_offset);
+                // Then perform the matrix multiplication for each x block along N:
+                for(unsigned int i = 0; i < num_x_blocks; ++i)
+                {
+                    ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+                    this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+                }
+            }
+        };
+        _workloads.push_back(workload);
+    }
+    _is_prepared = true;
+}
+
+namespace
+{
+// Factory: creates a NEGEMMInterleavedPrepareBWrapperKernelTemplate<InputType, use_dot> and configures it with b, transformed_b and params.
+template <typename InputType, bool use_dot = false>
+std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b, ITensor *transformed_b, const INEGEMMWrapperKernel::Params &params)
+{
+    auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<InputType, use_dot>>();
+    prepare_b->configure(b, transformed_b, false, NEScheduler::get().cpu_info(), params);
+    return std::move(prepare_b); // the local has the concrete template type; move it into the returned NEGEMMInterleavedPrepareBWrapperKernel pointer
+}
+
+// Factory: creates a NEGEMMInterleavedTransformAWrapperTemplate<InputType, use_dot> and configures it with a, transformed_a, block_walker and params.
+template <typename InputType, bool use_dot = false>
+std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a, ITensor *transformed_a, const Window &block_walker, const INEGEMMWrapperKernel::Params &params)
+{
+    auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<InputType, use_dot>>();
+    transform_a->configure(a, transformed_a, false, block_walker, params);
+    return std::move(transform_a); // the local has the concrete template type; move it into the returned NEGEMMInterleavedTransformAWrapper pointer
+}
+
+// Factory: creates a NEGEMMInterleavedMatrixMultiplyWrapperTemplate<InputType, OutputType, use_dot> and configures it with the given tensors, windows and GEMM settings.
+template <typename InputType, typename OutputType, bool use_dot = false>
+std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
+                                                                                    const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params &params, bool pretranspose_b, float alpha, float beta)
+{
+    auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<InputType, OutputType, use_dot>>();
+    matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, NEScheduler::get().num_threads());
+    return std::move(matrix_multiply); // the local has the concrete template type; move it into the returned NEGEMMInterleavedMatrixMultiplyWrapper pointer
+}
+} // namespace
+
+// Configures the GEMM on inputs a, b and output c: creates the B-preparation, A-transform and matrix-multiply wrappers for a's data type,
+// and sets up the block/batch windows and intermediate tensors. use_dot picks the use_dot=true 8-bit instantiations (compiled under __aarch64__ only).
+void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot)
+{
+    _params = INEGEMMWrapperKernel::extract_parameters(a, b, c);
+    _a = a;
+    _b = b;
+    _c = c;
+    _pretranspose_b = pretranspose_b;
+
+    const DataType input_type = a->info()->data_type();
+    // Forcing 128-byte alignment (required by 32-bit kernels)
+    const unsigned int alignment = 128;
+    _transformed_b.allocator()->init(TensorInfo{}, alignment);
+    _tmp_c.allocator()->init(TensorInfo{}, alignment);
+    if(!_pretranspose_b)
+    {
+        // If B is transposed at every iteration then transformed_B can be managed:
+        _memory_group.manage(&_transformed_b);
+    }
+    switch(input_type)
+    {
+        case DataType::F32:
+            _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
+            break;
+#ifdef __aarch64__
+        case DataType::U8:
+        case DataType::QASYMM8:
+            if(use_dot)
+            {
+                _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
+            }
+            else
+            {
+                _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
+            }
+            break;
+        case DataType::S8:
+            if(use_dot)
+            {
+                _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
+            }
+            else
+            {
+                _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
+            }
+            break;
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            ARM_COMPUTE_ERROR("DataType not supported");
+            break;
+    }
+    ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+
+    _block_sizes = _prepare_b->block_sizes();
+    // Block window: N (rounded up) in x_block steps, K (rounded up) in k_block steps, one Z step per multi
+    _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
+    _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
+    _block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
+    // Batch window: m_round (rounded up) in strategy_out_height steps, one Y step per batch
+    _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+    _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
+    // Intermediate tensors registered with the memory group:
+    _transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
+    _memory_group.manage(&_transformed_a);
+    _memory_group.manage(&_tmp_c);
+
+    switch(input_type)
+    {
+        case DataType::F32:
+            _transform_a = instantiate_transformA<float>(_a, &_transformed_a, _block_walker, _params);
+            _matrix_multiply = instantiate_matrix_multiply<float, float>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            break;
+#ifdef __aarch64__
+        case DataType::U8:
+        case DataType::QASYMM8:
+            if(use_dot)
+            {
+                _transform_a = instantiate_transformA<uint8_t, true>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            else
+            {
+                _transform_a = instantiate_transformA<uint8_t, false>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            break;
+        case DataType::S8:
+            if(use_dot)
+            {
+                _transform_a = instantiate_transformA<int8_t, true>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            else
+            {
+                _transform_a = instantiate_transformA<int8_t, false>(_a, &_transformed_a, _block_walker, _params);
+                _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            }
+            break;
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _transform_a = instantiate_transformA<__fp16>(_a, &_transformed_a, _block_walker, _params);
+            _matrix_multiply = instantiate_matrix_multiply<__fp16, __fp16>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            break; // unsupported types were already reported by ARM_COMPUTE_ERROR in the first switch
+    }
+    ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
+    ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
+    _transformed_a.allocator()->allocate();
+    _tmp_c.allocator()->allocate();
+    _transformed_b.allocator()->allocate();
+}
+} // namespace arm_compute