arm_compute v19.11: changes under src/runtime/CL/functions
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 4aeb3a1..00dbb71 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,21 @@
#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
+namespace arm_compute
+{
+CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
+ : ICLSimpleFunction(ctx)
+{
+}
void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
{
- auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
+ auto core_ctx = _ctx ? _ctx->core_runtime_context() : /* Legacy */ nullptr;
+
+ auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>(core_ctx);
k->configure(input, output, act_info);
_kernel = std::move(k);
}
@@ -40,3 +48,4 @@
{
return CLActivationLayerKernel::validate(input, output, act_info);
}
+} // namespace arm_compute
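
Usage sketch for the change above (not part of the patch): CLActivationLayer now takes an optional CLRuntimeContext; passing nullptr keeps the legacy CLScheduler-based path seen in the configure() body. Tensor shapes and the RELU activation below are illustrative assumptions.

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"

    using namespace arm_compute;

    void run_relu_example()
    {
        CLScheduler::get().default_init(); // set up context, queue and kernel library

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        // nullptr selects the legacy path; a CLRuntimeContext pointer can be passed instead
        CLActivationLayer act(nullptr);
        act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

        src.allocator()->allocate();
        dst.allocator()->allocate();

        act.run();
        CLScheduler::get().sync();
    }
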
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index a6393c5..fd172d5 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,26 +23,33 @@
*/
#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
namespace arm_compute
{
-void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _reduction_function(support::cpp14::make_unique<CLReductionOperation>(std::move(memory_manager)))
{
- auto k = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel>();
- k->configure(input, output, axis, op);
- _kernel = std::move(k);
+}
+
+void CLArgMinMaxLayer::configure(ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
+ _reduction_function->configure(input, output, axis, op, false);
}
Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
- return CLReductionOperationKernel::validate(input, output, axis, op);
+ return CLReductionOperation::validate(input, output, axis, op, false);
+}
+
+void CLArgMinMaxLayer::run()
+{
+ _reduction_function->run();
}
} // namespace arm_compute
\ No newline at end of file
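
Usage sketch (not part of the patch): CLArgMinMaxLayer is now a thin wrapper over CLReductionOperation configured with keep_dims = false. The U32 index type and the reduced output shape below are assumptions for illustration.

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"

    using namespace arm_compute;

    void run_argmax_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, idx;
        src.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
        // Axis 0 is reduced away (keep_dims is false inside the wrapper); U32 indices assumed
        idx.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::U32));

        CLArgMinMaxLayer argmax(nullptr); // optionally pass a std::shared_ptr<IMemoryManager>
        argmax.configure(&src, 0, &idx, ReductionOperation::ARG_IDX_MAX);

        src.allocator()->allocate();
        idx.allocator()->allocate();

        argmax.run(); // internally runs the wrapped CLReductionOperation
        CLScheduler::get().sync();
    }
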
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 4c7458d..dbaea81 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -128,7 +128,7 @@
}
else
{
- ARM_COMPUTE_ERROR("Gradient size %d not supported", gradient_size);
+ ARM_COMPUTE_ERROR_VAR("Gradient size %d not supported", gradient_size);
}
// Manage intermediate buffers
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index b22809e..5e1278d 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -48,7 +48,7 @@
std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
- const TensorShape out_shape(input->info()->tensor_shape()[0], abs(end[0] - start[0]) + 1, abs(end[1] - start[1]) + 1);
+ const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
output->info()->set_tensor_shape(out_shape);
}
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index 63a45aa..eaf7c66 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index f01b58a..e717f79 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -38,41 +38,386 @@
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
+namespace
+{
+Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
+{
+ // This function should be removed and incorporated inside CLDepthwiseConvolutionLayerInternal3x3 once CLDepthwiseConvolutionLayer3x3 is properly removed
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ const bool needs_permute = is_nhwc && (depth_multiplier > 1);
+ const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && is_quantized;
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ DepthwiseConvolutionReshapeInfo info;
+ info.c0 = 4;
+ info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
+
+ TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
+ if(is_quantized)
+ {
+ if(is_data_type_quantized_per_channel(weights->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
+
+ const size_t idx_c = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+ output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ }
+ }
+
+ if(needs_permute)
+ {
+ TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+
+ permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+ permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+ permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
+
+ const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
+ const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
+ const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
+ conv_info, depth_multiplier, act_info, gpu_target,
+ dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+ }
+ else if(is_nhwc)
+ {
+ if(needs_weights_reshape)
+ {
+ auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases,
+ output, conv_info, depth_multiplier, act_info,
+ dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+ dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target,
+ dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+ }
+ return Status{};
+}
+} // namespace
+
CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _kernel(nullptr), _border_handler(), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), _reshape_weights(), _permuted_input(),
- _permuted_weights(), _permuted_output(), _original_weights(nullptr), _needs_permute(false), _needs_weights_reshape(false), _is_prepared(false)
+ : _func(std::move(memory_manager))
{
}
void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
ActivationLayerInfo act_info, const Size2D &dilation)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- // idx_w and idx_h only used for validation
- const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_UNUSED(idx_w);
- ARM_COMPUTE_UNUSED(idx_h);
+ _func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
+{
+ return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
+}
- const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+void CLDepthwiseConvolutionLayer3x3::run()
+{
+ _func.run();
+}
- _needs_permute = is_nhwc && (depth_multiplier > 1);
- _needs_weights_reshape = is_nhwc && (depth_multiplier == 1)
- && is_data_type_quantized_asymmetric(input->info()->data_type());
+void CLDepthwiseConvolutionLayer3x3::prepare()
+{
+ _func.prepare();
+}
+
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)),
+ _dwc_native_kernel(),
+ _permute_input_to_nhwc(),
+ _permute_weights_to_nhwc(),
+ _permute_output_to_nchw(),
+ _permuted_input(),
+ _permuted_weights(),
+ _permuted_output(),
+ _output_multipliers(),
+ _output_shifts(),
+ _original_weights(),
+ _input(),
+ _output(),
+ _needs_permute(false),
+ _is_prepared(false),
+ _is_quantized(false)
+{
+}
+
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
+ weights->info(),
+ biases != nullptr ? biases->info() : nullptr,
+ output->info(),
+ conv_info,
+ depth_multiplier,
+ act_info,
+ dilation));
+
+ _is_quantized = is_data_type_quantized(input->info()->data_type());
_is_prepared = false;
_original_weights = weights;
+ _input = input;
+ _output = output;
+ _needs_permute = input->info()->data_layout() == DataLayout::NCHW;
+
+ ICLTensor *input_to_use = input;
+ const ICLTensor *weights_to_use = weights;
+ ICLTensor *output_to_use = output;
+ if(_needs_permute)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+ _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+ // Set output quantization info before dwc kernel configure
+ _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
+
+ input_to_use = &_permuted_input;
+ weights_to_use = &_permuted_weights;
+ output_to_use = &_permuted_output;
+ }
+
+ CLTensor *output_multipliers_to_use = nullptr;
+ CLTensor *output_shifts_to_use = nullptr;
+ if(_is_quantized)
+ {
+ const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
+
+ _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+ _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
+ output_multipliers_to_use = &_output_multipliers;
+ output_shifts_to_use = &_output_shifts;
+ }
+
+ DWCWeightsKernelInfo dwc_weights_info;
+ dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
+ DWCKernelInfo dwc_info;
+ dwc_info.activation_info = act_info;
+ _dwc_native_kernel.configure(input_to_use, weights_to_use, biases, output_to_use,
+ dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
+ output_multipliers_to_use, output_shifts_to_use);
+
+ if(_needs_permute)
+ {
+ _permuted_input.allocator()->allocate();
+
+ // Configure the function to transform the convoluted output to NCHW format
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _permute_output_to_nchw.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+ _permuted_output.allocator()->allocate();
+ }
+
+ if(_is_quantized)
+ {
+ _output_multipliers.allocator()->allocate();
+ _output_shifts.allocator()->allocate();
+ }
+}
+
+Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
+ DWCWeightsKernelInfo dwc_weights_info;
+ dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
+ DWCKernelInfo dwc_info;
+ dwc_info.activation_info = act_info;
+
+ const bool needs_permute = input->data_layout() == DataLayout::NCHW;
+
+ const bool is_quantized = is_data_type_quantized(input->data_type());
+
+ TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
+ if(is_quantized)
+ {
+ if(is_data_type_quantized_per_channel(weights->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
+
+ const size_t idx_c = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+ output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ }
+ }
+
+ if(needs_permute)
+ {
+ TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+
+ permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
+ permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
+ permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
+
+ const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, dwc_weights_info,
+ dwc_info, conv_info, depth_multiplier, dilation,
+ &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier,
+ dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+ }
+ return Status{};
+}
+
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ if(_needs_permute)
+ {
+ _permute_input_to_nhwc.run();
+ }
+ CLScheduler::get().enqueue(_dwc_native_kernel);
+ if(_needs_permute)
+ {
+ _permute_output_to_nchw.run();
+ }
+}
+
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
+{
+ if(!_is_prepared)
+ {
+ if(_is_quantized)
+ {
+ _output_multipliers.map();
+ _output_shifts.map();
+ const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ quantization::compute_quantized_multipliers_and_shifts(_input->info(),
+ _original_weights->info(),
+ _output->info(),
+ idx_ofms,
+ reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+ reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+ _output_multipliers.unmap();
+ _output_shifts.unmap();
+ }
+
+ if(_needs_permute)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ _permuted_weights.allocator()->allocate();
+ _permute_weights_to_nhwc.run();
+ _original_weights->mark_as_unused();
+ }
+ _is_prepared = true;
+ }
+}
+
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)),
+ _kernel(nullptr),
+ _border_handler(),
+ _permute_input_to_nchw(),
+ _permute_weights_to_nchw(),
+ _permute_output_to_nhwc(),
+ _reshape_weights(),
+ _permuted_input(),
+ _permuted_weights(),
+ _permuted_output(),
+ _output_multipliers(),
+ _output_shifts(),
+ _original_weights(nullptr),
+ _input(nullptr),
+ _output(nullptr),
+ _needs_permute(false),
+ _needs_weights_reshape(false),
+ _is_prepared(false),
+ _is_quantized(false)
+{
+}
+
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+{
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer3x3::validate(input->info(),
+ weights->info(),
+ biases != nullptr ? biases->info() : nullptr,
+ output->info(),
+ conv_info,
+ depth_multiplier,
+ act_info,
+ gpu_target,
+ dilation));
+
+ const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _needs_permute = is_nhwc && (depth_multiplier > 1);
+ _needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && _is_quantized;
+
+ _is_prepared = false;
+ _original_weights = weights;
+ _input = input;
+ _output = output;
ICLTensor *input_to_use = input;
const ICLTensor *weights_to_use = weights;
ICLTensor *output_to_use = output;
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
- const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+ const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
+ const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
DepthwiseConvolutionReshapeInfo info;
info.c0 = 4;
@@ -112,9 +457,30 @@
_kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
}
+ CLTensor *output_multipliers_to_use = nullptr;
+ CLTensor *output_shifts_to_use = nullptr;
+ if(_is_quantized)
+ {
+ const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t num_filters = (is_quantized_per_channel) ? weights->info()->dimension(idx_c) : 1;
+
+ _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+ _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
+ output_multipliers_to_use = &_output_multipliers;
+ output_shifts_to_use = &_output_shifts;
+ }
+
// Configure kernel
- _kernel->set_target(CLScheduler::get().target());
- _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info, dilation);
+ _kernel->set_target(gpu_target);
+ _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
+ act_info, dilation, output_multipliers_to_use, output_shifts_to_use);
+
+ if(_is_quantized)
+ {
+ _output_multipliers.allocator()->allocate();
+ _output_shifts.allocator()->allocate();
+ }
// Permute output if needed
if(_needs_permute)
@@ -136,73 +502,13 @@
_border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
}
-Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
+Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
- const bool needs_permute = is_nhwc && (depth_multiplier > 1);
- const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && is_quantized;
- const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
- const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
- const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
- DepthwiseConvolutionReshapeInfo info;
- info.c0 = 4;
- info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
-
- if(is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform();
-
- const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- ARM_COMPUTE_UNUSED(multiplier);
- ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
- }
-
- if(needs_permute)
- {
- TensorShape permuted_input_shape = input->tensor_shape();
- TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-
- permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
- permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
- permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
-
- const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
- const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
- const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target,
- dilation));
- }
- else if(is_nhwc)
- {
- if(needs_weights_reshape)
- {
- auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
- act_info, dilation));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation));
- }
-
- return Status{};
+ return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
}
-void CLDepthwiseConvolutionLayer3x3::run()
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run()
{
prepare();
@@ -221,10 +527,25 @@
}
}
-void CLDepthwiseConvolutionLayer3x3::prepare()
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepare()
{
if(!_is_prepared)
{
+ if(_is_quantized)
+ {
+ _output_multipliers.map();
+ _output_shifts.map();
+ const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ quantization::compute_quantized_multipliers_and_shifts(_input->info(),
+ _original_weights->info(),
+ _output->info(),
+ idx_ofms,
+ reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+ reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+ _output_multipliers.unmap();
+ _output_shifts.unmap();
+ }
+
if(_needs_permute)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
@@ -246,259 +567,92 @@
}
}
-CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
- : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _activationlayer_function(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(),
- _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr),
- _optimised_function(nullptr)
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_manager(std::move(memory_manager)), _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_3x3(), _func_generic()
{
}
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+ ActivationLayerInfo act_info, const Size2D &dilation)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-
- const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
-
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
-
- const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
-
- if(bool(can_run_optimised_3x3_kernel))
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info,
+ dilation, gpu_target);
+ switch(_depth_conv_func)
{
- auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
- f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
- _optimised_function = std::move(f);
- }
- else
- {
- const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
-
- const size_t weights_w = weights->info()->dimension(idx_w);
- const size_t weights_h = weights->info()->dimension(idx_h);
- const size_t weights_z = weights->info()->dimension(idx_c);
-
- _is_prepared = false;
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- bool append_bias = (biases != nullptr) && !_is_quantized;
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Calculate output shape
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
- // Output width and height
- const unsigned int conv_w = output_shape[idx_w];
- const unsigned int conv_h = output_shape[idx_h];
-
- // Set up intermediate tensors
- const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
- const size_t conv_size = conv_w * conv_h;
-
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- // Im2Col configuration
- TensorShape shape_im2col = input->info()->tensor_shape();
- shape_im2col.set(0, patch_size);
- shape_im2col.set(1, conv_size);
- shape_im2col.set(2, weights_z);
- _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _im2col_kernel.set_target(gpu_target);
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
- CLScheduler::get().tune_kernel_static(_im2col_kernel);
-
- // Weights reshape configuration
- const TensorShape shape_weights_reshape(patch_size, weights_z);
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
-
- // GEMV configuration
- DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
- TensorShape shape_v2mm_out = input->info()->tensor_shape();
- shape_v2mm_out.set(0, conv_size * weights_z);
- shape_v2mm_out.set(1, 1);
- shape_v2mm_out.set(2, 1);
- _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
- _v2mm_kernel.set_target(gpu_target);
- _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
- CLScheduler::get().tune_kernel_static(_v2mm_kernel);
- _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
-
- // Output staged configuration
- if(_is_quantized)
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_3x3.set_memory_group(_memory_manager);
+ _func_3x3.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
{
- const UniformQuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
-
- int output_multiplier = 0;
- int output_shift = 0;
- const float multiplier = iq_info.scale * wq_info.scale / output_quant_info.scale;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
- _output_reshaped.allocator()->allocate();
+ _func_generic.set_memory_group(_memory_manager);
+ _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
}
-
- // Fill borders on inputs
- PixelValue zero_in(static_cast<int32_t>(0));
- PixelValue zero_w(static_cast<int32_t>(0));
- if(_is_quantized)
- {
- zero_in = PixelValue(static_cast<int32_t>(iq_info.offset));
- zero_w = PixelValue(static_cast<int32_t>(wq_info.offset));
- }
- BorderSize border_size = _v2mm_kernel.border_size();
- _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
-
- border_size.bottom = 0;
- _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
-
- // Allocate intermediate tensors
- _input_reshaped.allocator()->allocate();
- _v2mm_output.allocator()->allocate();
-
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
-
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
}
}
Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
{
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
-
- const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
-
- if(!can_run_optimised_3x3_kernel)
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, gpu_target);
+ switch(depth_conv_func)
{
- const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ return CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
+ case DepthwiseConvolutionFunction::GENERIC:
+ return CLDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ default:
+ ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+ }
+}
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
-
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const bool append_bias = (biases != nullptr) && !is_quantized;
- const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
- const size_t weights_w = weights->dimension(idx_w);
- const size_t weights_h = weights->dimension(idx_h);
- const size_t weights_z = weights->dimension(idx_c);
- const unsigned int conv_w = output_shape[idx_w];
- const unsigned int conv_h = output_shape[idx_h];
- const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
- const size_t conv_size = conv_w * conv_h;
-
- TensorShape shape_im2col = input->tensor_shape();
- shape_im2col.set(0, patch_size);
- shape_im2col.set(1, conv_size);
- shape_im2col.set(2, weights_z);
- TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
-
- const TensorShape shape_weights_reshape(patch_size, weights_z);
- TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
-
- DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
- TensorShape shape_v2mm_out = input->tensor_shape();
- shape_v2mm_out.set(0, conv_size * weights_z);
- shape_v2mm_out.set(1, 1);
- shape_v2mm_out.set(2, 1);
- TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
-
- TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
-
- if(is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform();
-
- const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- ARM_COMPUTE_UNUSED(multiplier);
- ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
- }
-
- // Validate Activation Layer
- if(act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
- }
+DepthwiseConvolutionFunction CLDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, GPUTarget gpu_target)
+{
+ if(bool(CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation)) && (is_data_type_float(input->data_type())
+ || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD))
+ {
+ return DepthwiseConvolutionFunction::OPTIMIZED;
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation));
+ return DepthwiseConvolutionFunction::GENERIC;
}
- return Status{};
}
void CLDepthwiseConvolutionLayer::run()
{
- prepare();
-
- if(_optimised_function != nullptr)
+ switch(_depth_conv_func)
{
- _optimised_function->run();
- }
- else
- {
- CLScheduler::get().enqueue(_im2col_kernel);
- CLScheduler::get().enqueue(_v2mm_input_fill_border);
- CLScheduler::get().enqueue(_v2mm_kernel);
- CLScheduler::get().enqueue(_vector_to_tensor_kernel);
- if(_is_quantized)
- {
- CLScheduler::get().enqueue(_output_stage_kernel);
- }
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_3x3.run();
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.run();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
}
}
void CLDepthwiseConvolutionLayer::prepare()
{
- if(_optimised_function != nullptr)
+ switch(_depth_conv_func)
{
- _optimised_function->prepare();
- }
- else
- {
- if(!_is_prepared)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- // Run weights reshaping and mark original weights tensor as unused
- _weights_reshaped.allocator()->allocate();
- CLScheduler::get().enqueue(_weights_reshape_kernel);
- CLScheduler::get().enqueue(_v2mm_weights_fill_border);
- _original_weights->mark_as_unused();
-
- CLScheduler::get().queue().finish();
- _is_prepared = true;
- }
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_3x3.prepare();
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.prepare();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
}
}
} // namespace arm_compute
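
Usage sketch for the rewritten depthwise function (not part of the patch): callers configure the wrapper only; get_depthwiseconvolution_function() picks the OPTIMIZED internal 3x3 path or the GENERIC native kernel. NHWC layout and the sizes below are illustrative assumptions.

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

    using namespace arm_compute;

    void run_depthwise_example()
    {
        CLScheduler::get().default_init();

        // NHWC tensors: 8 channels, 32x32 spatial, batch 1 (illustrative sizes)
        TensorInfo src_info(TensorShape(8U, 32U, 32U, 1U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        TensorInfo weights_info(TensorShape(8U, 3U, 3U), 1, DataType::F32);
        weights_info.set_data_layout(DataLayout::NHWC);
        TensorInfo dst_info(TensorShape(8U, 32U, 32U, 1U), 1, DataType::F32);
        dst_info.set_data_layout(DataLayout::NHWC);

        CLTensor src, weights, dst;
        src.allocator()->init(src_info);
        weights.allocator()->init(weights_info);
        dst.allocator()->init(dst_info);

        CLDepthwiseConvolutionLayer dwc(nullptr); // memory manager is optional
        // Stride 1 with 1-pixel padding; the function selects OPTIMIZED (3x3) or GENERIC internally
        dwc.configure(&src, &weights, nullptr, &dst, PadStrideInfo(1, 1, 1, 1),
                      1 /* depth_multiplier */, ActivationLayerInfo(), Size2D(1U, 1U));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        dst.allocator()->allocate();

        dwc.run(); // prepare() is invoked by the selected path on first run
        CLScheduler::get().sync();
    }
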
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
deleted file mode 100644
index fa2c3af..0000000
--- a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLDepthwiseSeparableConvolutionLayer::CLDepthwiseSeparableConvolutionLayer()
- : _depthwise_conv(), _pointwise_conv()
-{
-}
-
-void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICLTensor *depthwise_weights, const ICLTensor *depthwise_biases, ICLTensor *depthwise_out,
- const ICLTensor *pointwise_weights, const ICLTensor *pointwise_biases, ICLTensor *output,
- const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
-{
- _depthwise_conv.configure(input, depthwise_weights, depthwise_biases, depthwise_out, depthwise_conv_info);
- _pointwise_conv.configure(depthwise_out, pointwise_weights, pointwise_biases, output, pointwise_conv_info);
-}
-
-void CLDepthwiseSeparableConvolutionLayer::run()
-{
- prepare();
-
- _depthwise_conv.run();
- _pointwise_conv.run();
-}
-
-void CLDepthwiseSeparableConvolutionLayer::prepare()
-{
- _depthwise_conv.prepare();
- _pointwise_conv.prepare();
-}
\ No newline at end of file
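
CLDepthwiseSeparableConvolutionLayer is removed in this release. A hedged sketch of an equivalent composition follows, mirroring what the deleted implementation did (a depthwise stage into an intermediate tensor, then a pointwise 1x1 convolution); using CLDepthwiseConvolutionLayer and CLDirectConvolutionLayer for the two stages is an assumption, and the caller owns the function objects and the intermediate tensor.

    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
    #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"

    using namespace arm_compute;

    // Depthwise stage writes to depthwise_out, pointwise (1x1) stage writes to output.
    void configure_separable(ICLTensor *input,
                             const ICLTensor *depthwise_weights, const ICLTensor *depthwise_biases, ICLTensor *depthwise_out,
                             const ICLTensor *pointwise_weights, const ICLTensor *pointwise_biases, ICLTensor *output,
                             const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info,
                             CLDepthwiseConvolutionLayer &depthwise_conv, CLDirectConvolutionLayer &pointwise_conv)
    {
        depthwise_conv.configure(input, depthwise_weights, depthwise_biases, depthwise_out, depthwise_conv_info,
                                 1 /* depth_multiplier */, ActivationLayerInfo(), Size2D(1U, 1U));
        pointwise_conv.configure(depthwise_out, pointwise_weights, pointwise_biases, output, pointwise_conv_info);
    }

    // At runtime, execute the stages in order:
    //     depthwise_conv.run();
    //     pointwise_conv.run();
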
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index c1a39ef..b8089d8 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -63,13 +63,8 @@
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
- ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
-
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
- info.pad().first, info.pad().second, stride_x, stride_y);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
@@ -92,9 +87,11 @@
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
- unsigned int padx = 0;
- unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, padx, pady);
+ unsigned int deconv_pad_x = 0;
+ unsigned int deconv_pad_y = 0;
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
@@ -109,6 +106,10 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ const unsigned int pad_left = info.pad_left();
+ const unsigned int pad_right = info.pad_right();
+ const unsigned int pad_top = info.pad_top();
+ const unsigned int pad_bottom = info.pad_bottom();
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
@@ -122,8 +123,7 @@
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
_flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
- info.pad().first, info.pad().second, stride_x, stride_y);
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
@@ -138,16 +138,30 @@
_memory_group.manage(&_scaled_output);
// Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
- unsigned int padx = 0;
- unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, padx, pady);
+ unsigned int deconv_pad_x = 0;
+ unsigned int deconv_pad_y = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+
+ unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0;
+ unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
+ deconv_pad_x -= deconv_pad_left + deconv_pad_right;
+ ARM_COMPUTE_ERROR_ON((deconv_pad_x % 2) != 0);
+ deconv_pad_left += deconv_pad_x / 2;
+ deconv_pad_right += deconv_pad_x / 2;
+
+ unsigned int deconv_pad_top = pad_bottom > pad_top ? pad_bottom - pad_top : 0;
+ unsigned int deconv_pad_bottom = pad_top > pad_bottom ? pad_top - pad_bottom : 0;
+ deconv_pad_y -= deconv_pad_top + deconv_pad_bottom;
+ ARM_COMPUTE_ERROR_ON((deconv_pad_y % 2) != 0);
+ deconv_pad_top += deconv_pad_y / 2;
+ deconv_pad_bottom += deconv_pad_y / 2;
TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
scale_out_info.set_data_layout(data_layout);
_scaled_output.allocator()->init(scale_out_info);
// configure scale function
- const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+ const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
_scale_f.configure(input, &_scaled_output, upsample_info);
// Setup the function to convolve the upscaled output
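
With the pad_left/right/top/bottom handling above, asymmetric deconvolution padding is now accepted and the padding_is_symmetric() check is gone from validate(). A hedged sketch of configuring the layer with asymmetric padding (tensor setup is left to the caller; the stride and pad values are illustrative):

    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"

    using namespace arm_compute;

    void configure_asymmetric_deconv(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
                                     CLDirectDeconvolutionLayer &deconv)
    {
        // Stride 2x2 with asymmetric padding: 0 left, 1 right, 0 top, 1 bottom
        const PadStrideInfo info(2, 2, /*pad_left*/ 0, /*pad_right*/ 1, /*pad_top*/ 0, /*pad_bottom*/ 1,
                                 DimensionRoundingType::CEIL);

        // Previously rejected by the symmetric-padding check; now folded into the
        // upsample stage's PadStrideInfo as shown in the hunk above
        deconv.configure(input, weights, bias, output, info);
    }
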
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index c5da649..a8167ce 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/Cast.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -32,13 +33,64 @@
#include <algorithm>
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::utils::cast;
namespace
{
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage)
{
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+ gemmlowp_output_stage.gemmlowp_multiplier = 0;
+ gemmlowp_output_stage.gemmlowp_shift = 0;
+
+ // Configure output stage for quantized case
+ if(is_data_type_quantized_asymmetric(input.data_type()))
+ {
+ const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output.quantization_info().uniform();
+
+ const auto output_quant_info = (output.total_size() == 0) ? iq_info : oq_info;
+
+ const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+
+ // Set the GEMMLowp output stage info
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+ gemmlowp_output_stage.gemmlowp_shift = output_shift;
+ gemmlowp_output_stage.gemmlowp_min_bound = 0;
+ gemmlowp_output_stage.gemmlowp_max_bound = 255;
+ gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
+ gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
+ }
+
+ return Status{};
+}
+
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info)
+{
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage));
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ 0, // depth_output_gemm3d
+ false, // reinterpret_input_as_3d
+ fc_info.retain_internal_weights, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ fc_info.fp_mixed_precision, // fp_mixed_precision
+ true, // broadcast_bias
+ ActivationLayerInfo()); // activation_info
+
if(is_data_type_quantized_asymmetric(input.data_type()))
{
const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
@@ -52,12 +104,13 @@
// Validate gemmlowp function
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
&weights.clone()->set_quantization_info(weights_quantization_info),
- nullptr,
- &output));
+ bias,
+ &output,
+ gemm_info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info));
}
return Status{};
@@ -76,14 +129,28 @@
return CLTransposeKernel::validate(input, output);
}
-CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _convert_weights(), _flatten_layer(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
- _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
- _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+ : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), _reshape_weights_function(),
+ _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true),
+ _are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
{
}
-void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
+void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
{
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage);
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ 0, // depth_output_gemm3d
+ false, // reinterpret_input_as_3d
+ fc_info.retain_internal_weights, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ fc_info.fp_mixed_precision, // fp_mixed_precision
+ true, // broadcast_bias
+ ActivationLayerInfo()); // activation_info
+
if(_is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -95,7 +162,7 @@
weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
// Configure gemmlowp function
- _mm_gemmlowp.configure(input, weights, nullptr, output);
+ _mm_gemmlowp.configure(input, weights, bias, output, gemm_info);
// Restore the original QuantizationInfo as the input and weights could be used in other fully connected layers
input->info()->set_quantization_info(input_quantization_info);
@@ -104,11 +171,11 @@
else
{
// Configure matrix multiply kernel
- _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */, 0, false, retain_internal_weights));
+ _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info);
}
}
-void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
+void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
{
ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
@@ -123,18 +190,18 @@
_flatten_layer.configure(input, &_flatten_output);
// Configure matrix multiply kernel
- configure_mm(&_flatten_output, weights, output, retain_internal_weights);
+ configure_mm(&_flatten_output, weights, bias, output, fc_info);
// Allocate the output tensor for flatten once all the configure methods have been called
_flatten_output.allocator()->allocate();
}
-void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
+void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
{
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
// Configure matrix multiply kernel
- configure_mm(input, weights, output, retain_internal_weights);
+ configure_mm(input, weights, bias, output, fc_info);
}
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
@@ -152,27 +219,13 @@
_are_weights_converted = true;
_are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
_is_fc_after_conv = true;
- _accumulate_biases = false;
_is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
_is_prepared = fc_info.retain_internal_weights;
_original_weights = weights;
- // Configure gemmlowp output
- if(_is_quantized)
+ if(_weights_manager)
{
- _gemmlowp_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
- }
-
- // Configure accumulate biases kernel for non quantized asymmetric types
- if(biases != nullptr && !_is_quantized)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-
- _accumulate_biases = true;
-
- // Configure accumulate biases kernel
- _accumulate_biases_kernel.set_target(CLScheduler::get().target());
- _accumulate_biases_kernel.configure(output, biases);
+ _weights_manager->manage(weights);
}
const ICLTensor *weights_to_use = weights;
@@ -199,50 +252,51 @@
// Reshape weights if needed
if(!_are_weights_reshaped)
{
- // Reshape the weights
- _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
- weights_to_use = &_reshape_weights_output;
+ if(_weights_manager && _weights_manager->are_weights_managed(weights))
+ {
+ _reshape_weights_managed_function.configure(weights);
+ weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed_function));
+ }
+ else
+ {
+ // Reshape the weights
+ _reshape_weights_function.configure(weights, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
+ }
}
// Convert weights if needed
if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
{
- // Convert weights
- _convert_weights.configure(weights_to_use,
- &_converted_weights_output,
- input->info()->tensor_shape(),
- fc_info.weights_trained_layout);
+ if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
+ {
+ _convert_weights_managed.configure(weights_to_use,
+ input->info()->tensor_shape(),
+ fc_info.weights_trained_layout);
+ weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_convert_weights_managed));
+ }
+ else
+ {
+ // Convert weights
+ _convert_weights.configure(weights_to_use,
+ &_converted_weights_output,
+ input->info()->tensor_shape(),
+ fc_info.weights_trained_layout);
- weights_to_use = &_converted_weights_output;
+ weights_to_use = &_converted_weights_output;
+ }
_are_weights_converted = false;
}
- // Configure fc core
- ICLTensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc(input, weights_to_use, tmp_output, fc_info.retain_internal_weights);
+ configure_conv_fc(input, weights_to_use, biases, output, fc_info);
}
else
{
// Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc(input, weights_to_use, tmp_output, fc_info.retain_internal_weights);
- }
-
- // Configure output stage for asymmetric quantized types
- if(_is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- int output_multiplier;
- int output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, oq_info.offset);
- _gemmlowp_output.allocator()->allocate();
+ configure_fc_fc(input, weights_to_use, biases, output, fc_info);
}
}
@@ -254,22 +308,12 @@
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
- bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- bool is_fc_after_conv = true;
- bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const GPUTarget gpu_target = CLScheduler::get().target();
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW));
const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
- const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-
- // Configure accumulate biases kernel for non quantized asymmetric types
- if(biases != nullptr && !is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
- }
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -279,7 +323,6 @@
const ITensorInfo *input_to_use = input;
const ITensorInfo *weights_to_use = weights;
- const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
// Check if we have a fully connected layer with batches
const bool is_batched_fc_layer = output->dimension(1) > 1;
@@ -325,21 +368,9 @@
// Fully Connected layer after a Fully Connected Layer without batches
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
}
+
// Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
-
- // Validate output stage for asymmetric quantized types
- if(is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
- const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
-
- ARM_COMPUTE_UNUSED(multiplier);
- ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info));
return Status{};
}
@@ -365,26 +396,16 @@
{
_mm_gemm.run();
}
-
- // Accumulate biases if provided
- if(_is_quantized)
- {
- _gemmlowp_output_stage.run();
- }
- else
- {
- if(_accumulate_biases)
- {
- CLScheduler::get().enqueue(_accumulate_biases_kernel);
- }
- }
}
void CLFullyConnectedLayer::prepare()
{
if(!_is_prepared)
{
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ if(!_weights_manager)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ }
auto release_unused = [](CLTensor * w)
{
@@ -401,22 +422,36 @@
// Reshape of the weights if needed (happens only once)
if(!_are_weights_reshaped)
{
- // Run reshape weights kernel and mark weights as unused
- _reshape_weights_output.allocator()->allocate();
- _reshape_weights_kernel.run();
+ if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+ {
+ cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function));
+ }
+ else
+ {
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_function.run();
- cur_weights->mark_as_unused();
- cur_weights = &_reshape_weights_output;
+ cur_weights->mark_as_unused();
+ cur_weights = &_reshape_weights_output;
+ }
_are_weights_reshaped = true;
}
// Convert weights if needed (happens only once)
if(!_are_weights_converted)
{
- _converted_weights_output.allocator()->allocate();
- _convert_weights.run();
+ if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+ {
+ _weights_manager->run(cur_weights, &_convert_weights_managed);
+ }
+ else
+ {
+ _converted_weights_output.allocator()->allocate();
+ _convert_weights.run();
+ cur_weights->mark_as_unused();
+ }
- cur_weights->mark_as_unused();
_are_weights_converted = true;
}
@@ -436,3 +471,4 @@
_is_prepared = true;
}
}
+} // namespace arm_compute
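
The new construct_gemmlowp_output_stage helper feeds the requantization multiplier (input_scale * weight_scale) / output_scale into calculate_quantized_multiplier_less_than_one. A standalone sketch of that decomposition, assuming the usual gemmlowp convention of a Q0.31 fixed-point multiplier plus a right shift (not the library's exact implementation):

#include <cassert>
#include <cmath>
#include <cstdint>

// Split a real multiplier in [0, 1) into a 32-bit fixed-point multiplier and a
// right shift so that x * multiplier ~= (x * quantized_multiplier) >> (31 + right_shift).
void quantize_multiplier_less_than_one(double multiplier, std::int32_t *quantized_multiplier, int *right_shift)
{
    assert(multiplier >= 0.0 && multiplier < 1.0);
    if(multiplier == 0.0)
    {
        *quantized_multiplier = 0;
        *right_shift          = 0;
        return;
    }
    int          exponent    = 0;
    const double significand = std::frexp(multiplier, &exponent); // multiplier = significand * 2^exponent, significand in [0.5, 1)
    std::int64_t q           = std::llround(significand * (1ll << 31));
    if(q == (1ll << 31)) // rounding pushed the significand up to 1.0
    {
        q /= 2;
        ++exponent;
    }
    *quantized_multiplier = static_cast<std::int32_t>(q);
    *right_shift          = -exponent;
}

For instance, with input scale 0.5, weight scale 0.25 and output scale 2.0 the real multiplier is 0.0625, which decomposes into quantized_multiplier = 2^30 and right_shift = 3.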
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index e78395f..8d46014 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -36,6 +36,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/float_ops.h"
+#include "arm_compute/core/utils/misc/Cast.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
@@ -44,12 +45,15 @@
{
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::cl_gemm;
+using namespace arm_compute::utils::cast;
-CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
: _memory_group(std::move(memory_manager)),
+ _weights_manager(weights_manager),
_mm_kernel(),
_reshape_lhs_kernel(),
_reshape_rhs_kernel(),
+ _reshape_rhs_kernel_managed(),
_mm_reshaped_kernel(),
_mm_reshaped_only_rhs_kernel(),
_tmp_a(),
@@ -65,37 +69,53 @@
{
GEMMType gemm_type = GEMMType::RESHAPED_V1;
- if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
+ if(gpu_target_is_in(gpu_target, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+ GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72,
+ GPUTarget::G76, GPUTarget::G77))
{
- if((m > 1) && (n < 16))
+ if(data_type == DataType::F32)
{
- gemm_type = GEMMType::RESHAPED_V1;
- }
- else if((m == 1) && (data_type == DataType::F32))
- {
- gemm_type = GEMMType::RESHAPED_ONLY_RHS;
- }
- else
- {
- // COMPMID-852
- if((k > 256) && (m > 4) && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+ if((m > 1) && (n < 16))
{
- constexpr float alpha = 3.2f;
- constexpr float fact0 = 1.51f;
- constexpr float fact1 = 1.66f;
- constexpr float ops = 12.0f;
- const float scale = k > 1024 ? 1.07f : 1.0f;
- gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+ gemm_type = GEMMType::RESHAPED_V1;
+ }
+ else if(m == 1)
+ {
+ gemm_type = GEMMType::RESHAPED_ONLY_RHS;
}
else
{
- gemm_type = GEMMType::NATIVE;
+ // COMPMID-852
+ if((k > 256) && (m > 4) && reshape_b_only_on_first_run)
+ {
+ constexpr float alpha = 3.2f;
+ constexpr float fact0 = 1.51f;
+ constexpr float fact1 = 1.66f;
+ constexpr float ops = 12.0f;
+ const float scale = k > 1024 ? 1.07f : 1.0f;
+ gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
+ }
+ else
+ {
+ gemm_type = GEMMType::NATIVE;
+ }
+ }
+
+ const auto workload = static_cast<float>((m * n) / 20.0f);
+
+ gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
+ }
+ else
+ {
+ if((m == 1) || (!reshape_b_only_on_first_run))
+ {
+ gemm_type = GEMMType::RESHAPED_ONLY_RHS;
+ }
+ else
+ {
+ gemm_type = GEMMType::RESHAPED_V2;
}
}
-
- const auto workload = static_cast<float>((m * n) / 20.0f);
-
- gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
}
else
{
@@ -162,8 +182,12 @@
GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
+ const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
+
+ // Manage intermediate buffers
_memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
+
+ if(!_reshape_b_only_on_first_run && use_mm_b)
{
_memory_group.manage(&_tmp_b);
}
@@ -172,16 +196,26 @@
_reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
// Configure transpose kernel
- _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ ICLTensor *reshaped_rhs = &_tmp_b;
+ if(_weights_manager && _weights_manager->are_weights_managed(b))
+ {
+ _reshape_rhs_kernel_managed.configure(b, rhs_info);
+ reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+ }
+ else
+ {
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ }
// Configure and tune matrix multiply kernel
- _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+ _mm_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
CLScheduler::get().tune_kernel_static(_mm_kernel);
// Allocate intermediate tensors
_tmp_a.allocator()->allocate();
- if(!_reshape_b_only_on_first_run)
+
+ if(!_reshape_b_only_on_first_run && use_mm_b)
{
_tmp_b.allocator()->allocate();
}
@@ -212,12 +246,16 @@
_reshape_lhs_kernel.set_target(gpu_target);
_mm_kernel.set_target(gpu_target);
+ const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
+
// Manage intermediate buffers
_memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
+
+ if(!_reshape_b_only_on_first_run && use_mm_b)
{
_memory_group.manage(&_tmp_b);
}
+
// _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
GEMMLHSMatrixInfo lhs_info{};
@@ -231,14 +269,25 @@
std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
_reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
- _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+ ICLTensor *reshaped_rhs = &_tmp_b;
+ if(_weights_manager && _weights_manager->are_weights_managed(b))
+ {
+ _reshape_rhs_kernel_managed.configure(b, rhs_info);
+ reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+ }
+ else
+ {
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ }
// Configure and tune matrix multiply kernel
- _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+ _mm_reshaped_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
// Allocate intermediate tensors
_tmp_a.allocator()->allocate();
- if(!_reshape_b_only_on_first_run)
+
+ if(!_reshape_b_only_on_first_run && use_mm_b)
{
_tmp_b.allocator()->allocate();
}
@@ -268,8 +317,10 @@
// Set the target for the kernels
_mm_kernel.set_target(gpu_target);
+ const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
+
// Manage intermediate buffers
- if(!_reshape_b_only_on_first_run)
+ if(!_reshape_b_only_on_first_run && use_mm_b)
{
_memory_group.manage(&_tmp_b);
}
@@ -284,12 +335,21 @@
// Configure lhs_info and rhs_info
std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
- _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ ICLTensor *reshaped_rhs = &_tmp_b;
+ if(_weights_manager && _weights_manager->are_weights_managed(b))
+ {
+ _reshape_rhs_kernel_managed.configure(b, rhs_info);
+ reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+ }
+ else
+ {
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ }
// Configure and tune matrix multiply kernel
- _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+ _mm_reshaped_only_rhs_kernel.configure(a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
- if(!_reshape_b_only_on_first_run)
+ if(!_reshape_b_only_on_first_run && use_mm_b)
{
_tmp_b.allocator()->allocate();
}
@@ -591,7 +651,14 @@
if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+ {
+ _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ }
}
CLScheduler::get().enqueue(_mm_kernel, true);
@@ -605,7 +672,14 @@
if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+ {
+ _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ }
}
CLScheduler::get().enqueue(_mm_reshaped_kernel, true);
@@ -616,7 +690,14 @@
if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+ {
+ _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ }
}
CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true);
@@ -635,10 +716,17 @@
{
if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run)
{
- // Run transpose kernel and mark original weights tensor as unused
- _tmp_b.allocator()->allocate();
- CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
- _original_b->mark_as_unused();
+ if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
+ {
+ _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+ }
+ else
+ {
+ // Run transpose kernel and mark original weights tensor as unused
+ _tmp_b.allocator()->allocate();
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+ _original_b->mark_as_unused();
+ }
}
CLScheduler::get().queue().finish();
_is_prepared = true;
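
For readability, the reshaped-kernel heuristic added above for the listed Mali targets can be restated as a single selector for the F32 case. This is a compacted paraphrase of the branch in the diff, not the library code itself:

enum class GEMMType
{
    NATIVE,
    RESHAPED_V1,
    RESHAPED_V2,
    RESHAPED_ONLY_RHS
};

// F32 branch of the selection heuristic: small-n batched GEMMs keep the legacy
// reshaped kernel, GEMV-like shapes reshape only the RHS, large deep GEMMs go
// through a small cost model, and big enough workloads are promoted to the new
// reshaped kernel.
GEMMType select_gemm_type_f32(unsigned int m, unsigned int n, unsigned int k, bool reshape_b_only_on_first_run)
{
    GEMMType gemm_type = GEMMType::NATIVE;
    if((m > 1) && (n < 16))
    {
        gemm_type = GEMMType::RESHAPED_V1;
    }
    else if(m == 1)
    {
        gemm_type = GEMMType::RESHAPED_ONLY_RHS;
    }
    else if((k > 256) && (m > 4) && reshape_b_only_on_first_run)
    {
        // COMPMID-852 cost model: weigh the reshape overhead against the expected gain.
        constexpr float alpha = 3.2f;
        constexpr float fact0 = 1.51f;
        constexpr float fact1 = 1.66f;
        constexpr float ops   = 12.0f;
        const float     scale = (k > 1024) ? 1.07f : 1.0f;
        gemm_type             = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
    }

    // Large workloads are promoted from RESHAPED_V1 to RESHAPED_V2.
    const float workload = (m * n) / 20.0f;
    if((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1))
    {
        gemm_type = GEMMType::RESHAPED_V2;
    }
    return gemm_type;
}

The non-F32 branch added in the same hunk simply picks RESHAPED_ONLY_RHS when m == 1 or when B is reshaped on every run, and RESHAPED_V2 otherwise.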
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index be6be04..d322723 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/Cast.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -35,8 +36,10 @@
#include <memory>
#include <tuple>
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::utils::cast;
CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
: _weights_reshape_kernel()
@@ -63,13 +66,14 @@
Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
if(biases != nullptr)
{
const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(weights->data_type()));
+
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
@@ -78,7 +82,6 @@
if((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
-
CLWeightsReshapeKernel::validate(weights, biases, output, num_groups);
}
@@ -90,9 +93,10 @@
CLScheduler::get().enqueue(_weights_reshape_kernel);
}
-CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(),
- _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false)
+CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+ : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager, weights_manager),
+ _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false),
+ _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false)
{
}
@@ -197,9 +201,9 @@
const unsigned int kernel_width = weights->info()->dimension(idx_width);
const unsigned int kernel_height = weights->info()->dimension(idx_height);
+ const unsigned int num_kernels = weights->info()->dimension(idx_kernels);
const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
_is_prepared = weights_info.retain_internal_weights();
@@ -233,11 +237,12 @@
conv_info,
dilation);
- unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels) / num_groups;
+ unsigned int mat_weights_cols = num_kernels / num_groups;
const ICLTensor *biases_to_use = biases;
bool append_bias = false;
+ ICLTensor *weights_to_use = &_weights_reshaped;
if(num_groups != 1 && biases != nullptr)
{
// num_groups != 1 can only be for NCHW
@@ -245,11 +250,27 @@
biases_to_use = nullptr;
append_bias = true;
- _reshape_weights.configure(weights, biases, &_weights_reshaped, num_groups);
+ if(_weights_manager && _weights_manager->are_weights_managed(weights))
+ {
+ _reshape_weights_managed.configure(weights, biases, num_groups);
+ weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed));
+ }
+ else
+ {
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, num_groups);
+ }
}
else
{
- _reshape_weights.configure(weights, nullptr, &_weights_reshaped, num_groups);
+ if(_weights_manager && _weights_manager->are_weights_managed(weights))
+ {
+ _reshape_weights_managed.configure(weights, nullptr, num_groups);
+ weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed));
+ }
+ else
+ {
+ _reshape_weights.configure(weights, nullptr, &_weights_reshaped, num_groups);
+ }
}
// Create tensor to store im2col reshaped inputs
@@ -289,20 +310,28 @@
}
GEMMLowpOutputStageInfo gemmlowp_output_stage;
- gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage.gemmlowp_offset = 0;
- gemmlowp_output_stage.gemmlowp_multiplier = 0;
- gemmlowp_output_stage.gemmlowp_shift = 0;
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
// Configure output stage for quantized case
if(_is_quantized)
{
- const auto output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
+ const auto output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
+ const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
+ const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1;
- const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
+
+ gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+ gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+ quantization::compute_quantized_multipliers_and_shifts(input->info(),
+ weights->info(),
+ output->info(),
+ idx_kernels,
+ gemmlowp_output_stage.gemmlowp_multipliers.data(),
+ gemmlowp_output_stage.gemmlowp_shifts.data());
+ gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+ gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
int min_activation = 0;
int max_activation = 0;
@@ -329,18 +358,16 @@
}
// Set the GEMMLowp output stage info
- gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
- gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
- gemmlowp_output_stage.gemmlowp_shift = output_shift;
- gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
- gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+ gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
}
// Configure and tune GEMM
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- configure_mm(gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
+ configure_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
if(!_skip_im2col)
{
@@ -375,8 +402,17 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
+ const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
+
+ if(is_quantized_per_channel)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() != DataType::QASYMM8, "Input data type not compatible with Weights");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8");
@@ -391,6 +427,7 @@
const unsigned int kernel_width = weights->dimension(idx_width);
const unsigned int kernel_height = weights->dimension(idx_height);
+ const unsigned int num_kernels = weights->dimension(idx_kernels);
TensorInfo im2col_reshaped_info{};
TensorInfo info_gemm{};
@@ -398,15 +435,10 @@
const ITensorInfo *gemm_input_to_use = input;
const ITensorInfo *gemm_output_to_use = output;
const ITensorInfo *weights_to_use = weights;
-
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
- const bool skip_col2im = data_layout == DataLayout::NHWC;
- bool fuse_activation = true;
-
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ const bool skip_col2im = data_layout == DataLayout::NHWC;
+ bool fuse_activation = true;
ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -442,7 +474,7 @@
conv_info,
dilation);
- unsigned int mat_weights_cols = weights->dimension(idx_kernels) / num_groups;
+ unsigned int mat_weights_cols = num_kernels / num_groups;
const ITensorInfo *biases_to_use = biases;
bool append_bias = false;
@@ -493,20 +525,27 @@
}
GEMMLowpOutputStageInfo gemmlowp_output_stage;
- gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage.gemmlowp_offset = 0;
- gemmlowp_output_stage.gemmlowp_multiplier = 0;
- gemmlowp_output_stage.gemmlowp_shift = 0;
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+ gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
if(is_quantized)
{
- const auto output_quant_info = (output->total_size() == 0) ? iq_info : oq_info;
+ const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+ const auto output_quant_info = (output->total_size() == 0) ? iq_info : oq_info;
+ const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1;
- const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
-
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+ gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+ gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+ quantization::compute_quantized_multipliers_and_shifts(input,
+ weights,
+ output,
+ idx_kernels,
+ gemmlowp_output_stage.gemmlowp_multipliers.data(),
+ gemmlowp_output_stage.gemmlowp_shifts.data());
+ gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+ gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
int min_activation = 0;
int max_activation = 0;
@@ -533,11 +572,9 @@
}
// Set the GEMMLowp output stage info
- gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
- gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
- gemmlowp_output_stage.gemmlowp_shift = output_shift;
- gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
- gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+ gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
}
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
@@ -602,11 +639,17 @@
if(!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- // Run weights reshaping and mark original weights tensor as unused
- _weights_reshaped.allocator()->allocate();
- _reshape_weights.run();
- _original_weights->mark_as_unused();
+ if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+ {
+ _weights_manager->run(_original_weights, &_reshape_weights_managed);
+ }
+ else
+ {
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ _reshape_weights.run();
+ _original_weights->mark_as_unused();
+ }
// Prepare GEMM
_is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
@@ -619,3 +662,4 @@
_is_prepared = true;
}
}
+} // namespace arm_compute
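
With QSYMM8_PER_CHANNEL weights the output stage above no longer has a single multiplier: one requantization factor is computed per output channel before being split into the fixed-point multiplier/shift pairs stored in gemmlowp_multipliers and gemmlowp_shifts. A small sketch of the per-channel factor, assuming the same (input_scale * weight_scale) / output_scale formula as the per-tensor case:

#include <vector>

// One requantization factor per output channel; per-tensor quantization is the
// special case where weight_scales holds a single value.
std::vector<double> per_channel_multipliers(double input_scale, const std::vector<double> &weight_scales, double output_scale)
{
    std::vector<double> multipliers;
    multipliers.reserve(weight_scales.size());
    for(const double weight_scale : weight_scales)
    {
        multipliers.push_back((input_scale * weight_scale) / output_scale);
    }
    return multipliers;
}

Each factor is then decomposed exactly like the per-tensor multiplier, which is why num_filters drives the size of the gemmlowp_multipliers and gemmlowp_shifts vectors.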
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 36a120e..4671be5 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -151,8 +151,8 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
}
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
- 0, 0, deconv_info.stride().first, deconv_info.stride().second);
+ const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
@@ -279,7 +279,7 @@
{
const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = _gemmlowp_final.info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier(0);
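
The validate() fix above passes a stride-only PadStrideInfo to deconvolution_output_dimensions. As background, and stated as an assumption rather than the exact library implementation, the usual transposed-convolution output size with symmetric padding is out = stride * (in - 1) + kernel - 2 * pad:

#include <cassert>

// Spatial output size of a transposed convolution with symmetric padding.
unsigned int deconv_output_dim(unsigned int in, unsigned int kernel, unsigned int stride, unsigned int pad)
{
    assert(in > 0 && stride > 0 && kernel >= 2 * pad + 1);
    return stride * (in - 1) + kernel - 2 * pad;
}

For example, a 4x4 input with a 3x3 kernel, stride 2 and zero padding (the case encoded by the stride-only PadStrideInfo) yields a 9x9 output.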
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 0286cb3..4c0a521 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -32,6 +32,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
namespace arm_compute
@@ -49,6 +50,7 @@
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
+ _weights_to_qasymm8(),
_mm_midgard_kernel(),
_mm_native_kernel(),
_mm_reshaped_only_rhs_kernel(),
@@ -57,18 +59,24 @@
_mtx_b_reduction_kernel(),
_offset_contribution_kernel(),
_offset_contribution_output_stage_kernel(),
+ _qasymm8_weights(),
_vector_sum_col(),
_vector_sum_row(),
_tmp_b(),
_mm_result_s32(),
+ _gemm_output_stage_multipliers(),
+ _gemm_output_stage_shifts(),
+ _matrix_a(nullptr),
_original_b(nullptr),
+ _output(nullptr),
_a_offset(0),
_b_offset(0),
_is_gemm_reshaped(true),
_is_midgard(false),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
- _fuse_output_stage(false)
+ _fuse_output_stage(false),
+ _convert_to_qasymm8(false)
{
}
@@ -81,7 +89,12 @@
_original_b = b;
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_a_offset = a->info()->quantization_info().uniform().offset;
- _b_offset = b->info()->quantization_info().uniform().offset;
+ _matrix_a = a;
+ _output = output;
+
+ _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
+ && is_data_type_quantized_asymmetric(a->info()->data_type());
+ _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset;
// Get the GPU target
const GPUTarget gpu_target = CLScheduler::get().target();
@@ -91,8 +104,6 @@
_mm_native_kernel.set_target(gpu_target);
_mm_reshaped_only_rhs_kernel.set_target(gpu_target);
- const ICLTensor *matrix_a = a;
- const ICLTensor *matrix_b = b;
GEMMRHSMatrixInfo rhs_info;
GEMMLHSMatrixInfo lhs_info;
@@ -110,6 +121,16 @@
_is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
_is_midgard = gpu_target == GPUTarget::MIDGARD;
+ if(_convert_to_qasymm8)
+ {
+ // Set data type for converted weights
+ TensorInfo weights_info(*b->info());
+ weights_info.set_data_type(DataType::QASYMM8);
+ _qasymm8_weights.allocator()->init(weights_info);
+ _weights_to_qasymm8.configure(b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
+ }
+
+ const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
if(_is_gemm_reshaped)
{
matrix_b = &_tmp_b;
@@ -123,7 +144,7 @@
std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
// Configure reshape RHS kernel
- _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
+ _mtx_b_reshape_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
}
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
@@ -137,7 +158,7 @@
}
// Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
+ _mtx_b_reduction_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col);
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -161,14 +182,14 @@
if(_is_gemm_reshaped)
{
// Configure and tune matrix multiply kernel
- _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
else
{
if(_is_midgard)
{
// Configure matrix multiply kernel
- _mm_midgard_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_midgard_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
else
{
@@ -176,13 +197,27 @@
std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
// Configure matrix multiply kernel
- _mm_native_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
}
-
// Configure offset contribution kernel
+ const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+
+ _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+ _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
_offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
- _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+ _a_offset, _b_offset, gemm_info.gemmlowp_output_stage(), &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+
+ _gemm_output_stage_multipliers.allocator()->allocate();
+ _gemm_output_stage_shifts.allocator()->allocate();
+ // Compute GEMM output multipliers and shifts for output stage
+ _gemm_output_stage_multipliers.map();
+ _gemm_output_stage_shifts.map();
+ std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
+ std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
+ _gemm_output_stage_multipliers.unmap();
+ _gemm_output_stage_shifts.unmap();
_mm_result_s32.allocator()->allocate();
}
@@ -191,14 +226,14 @@
if(_is_gemm_reshaped)
{
// Configure and tune matrix multiply kernel
- _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
else
{
if(_is_midgard)
{
// Configure matrix multiply kernel
- _mm_midgard_kernel.configure(matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_midgard_kernel.configure(_matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
else
{
@@ -206,7 +241,7 @@
std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
// Configure matrix multiply kernel
- _mm_native_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
}
@@ -237,7 +272,15 @@
Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ if(b->data_type() == DataType::QSYMM8_PER_CHANNEL)
+ {
+ // DataType::QSYMM8_PER_CHANNEL is supported only for weights
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() != DataType::QASYMM8, "Matrix A is not quantized while Matrix B is");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
@@ -245,7 +288,6 @@
int32_t b_offset = b->quantization_info().uniform().offset;
const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
TensorInfo tmp_b_info{};
GEMMRHSMatrixInfo rhs_info;
@@ -266,6 +308,16 @@
const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+ bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
+ && is_data_type_quantized_asymmetric(a->data_type());
+ TensorInfo weights_info(*b);
+ if(convert_to_qasymm8)
+ {
+ b_offset = -128;
+ weights_info.set_data_type(DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0));
+ }
+ const ITensorInfo *matrix_b_info = &weights_info;
if(reshape_matrix_b)
{
matrix_b_info = &tmp_b_info;
@@ -274,8 +326,8 @@
std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
// Validate reshape RHS kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+ auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
}
TensorInfo info_vector_sum_col{};
@@ -284,10 +336,10 @@
// Validate matrix B reduction kernel only if _a_offset is not equal to 0
if(a_offset != 0)
{
- info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+ info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
// Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col));
}
// Validate Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -332,13 +384,19 @@
}
// Validate offset contribution kernel
+ const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+
+ const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
a_offset == 0 ? nullptr : &info_vector_sum_col,
b_offset == 0 ? nullptr : &info_vector_sum_row,
c,
output,
a_offset, b_offset,
- gemm_info.gemmlowp_output_stage()));
+ gemm_info.gemmlowp_output_stage(),
+ &gemm_output_stage_multipliers_shifts_info,
+ &gemm_output_stage_multipliers_shifts_info));
}
else
{
@@ -438,6 +496,12 @@
{
if(!_is_prepared)
{
+ if(_convert_to_qasymm8)
+ {
+ _qasymm8_weights.allocator()->allocate();
+ CLScheduler::get().enqueue(_weights_to_qasymm8, false);
+ }
+
if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
{
ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
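
The reduction kernels above are only configured when the opposite operand's offset is non-zero, because the offset contribution expands the quantized product into the raw dot product plus correction terms. A self-contained sketch of that identity for a single dot product (illustrative, not the kernel code):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// sum_k (a_k - a_off) * (b_k - b_off)
//   = sum_k a_k * b_k  -  b_off * sum_k a_k  -  a_off * sum_k b_k  +  K * a_off * b_off
// The row sums of A are only needed when b_off != 0, and the column sums of B only when a_off != 0.
std::int32_t dot_with_offsets(const std::vector<std::uint8_t> &a, const std::vector<std::uint8_t> &b, std::int32_t a_off, std::int32_t b_off)
{
    assert(a.size() == b.size());
    std::int32_t raw_dot = 0;
    std::int32_t sum_a   = 0;
    std::int32_t sum_b   = 0;
    for(std::size_t k = 0; k < a.size(); ++k)
    {
        raw_dot += static_cast<std::int32_t>(a[k]) * static_cast<std::int32_t>(b[k]);
        sum_a += a[k];
        sum_b += b[k];
    }
    const auto K = static_cast<std::int32_t>(a.size());
    return raw_dot - b_off * sum_a - a_off * sum_b + K * a_off * b_off;
}

This is also why the conversion path above sets _b_offset to -128 for QSYMM8_PER_CHANNEL weights turned into QASYMM8: the shifted zero point of the converted weights is handled by this offset correction rather than by the matrix multiply itself.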
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index d712a23..c9eb8ab 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -30,26 +30,33 @@
namespace arm_compute
{
CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)),
+ : _memory_group(memory_manager),
_permute_deltas_kernel(),
_flatten_deltas_kernel(),
_permute_scores_kernel(),
_flatten_scores_kernel(),
_compute_anchors_kernel(),
_bounding_box_kernel(),
- _memset_kernel(),
- _padded_copy_kernel(),
- _cpp_nms_kernel(),
+ _pad_kernel(),
+ _dequantize_anchors(),
+ _dequantize_deltas(),
+ _quantize_all_proposals(),
+ _cpp_nms(memory_manager),
_is_nhwc(false),
+ _is_qasymm8(false),
_deltas_permuted(),
_deltas_flattened(),
+ _deltas_flattened_f32(),
_scores_permuted(),
_scores_flattened(),
_all_anchors(),
+ _all_anchors_f32(),
_all_proposals(),
+ _all_proposals_quantized(),
_keeps_nms_unused(),
_classes_nms_unused(),
_proposals_4_roi_values(),
+ _all_proposals_to_use(nullptr),
_num_valid_proposals(nullptr),
_scores_out(nullptr)
{
@@ -61,63 +68,93 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
- _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
- const DataType data_type = deltas->info()->data_type();
- const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
- const int total_num_anchors = num_anchors * feat_width * feat_height;
- const int pre_nms_topN = info.pre_nms_topN();
- const int post_nms_topN = info.post_nms_topN();
- const size_t values_per_roi = info.values_per_roi();
+ _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
+ const DataType scores_data_type = scores->info()->data_type();
+ _is_qasymm8 = scores_data_type == DataType::QASYMM8;
+ const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int pre_nms_topN = info.pre_nms_topN();
+ const int post_nms_topN = info.post_nms_topN();
+ const size_t values_per_roi = info.values_per_roi();
+
+ const QuantizationInfo scores_qinfo = scores->info()->quantization_info();
+ const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
+ const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
// Compute all the anchors
_memory_group.manage(&_all_anchors);
_compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
- _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, data_type));
+ _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
// Permute and reshape deltas
+ _memory_group.manage(&_deltas_flattened);
if(!_is_nhwc)
{
_memory_group.manage(&_deltas_permuted);
- _memory_group.manage(&_deltas_flattened);
_permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
_flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
_deltas_permuted.allocator()->allocate();
}
else
{
- _memory_group.manage(&_deltas_flattened);
_flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
}
const TensorShape flatten_shape_scores(1, total_num_anchors);
- _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, data_type));
+ _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, scores_data_type, scores_qinfo));
// Permute and reshape scores
+ _memory_group.manage(&_scores_flattened);
if(!_is_nhwc)
{
_memory_group.manage(&_scores_permuted);
- _memory_group.manage(&_scores_flattened);
_permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
_flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
_scores_permuted.allocator()->allocate();
}
else
{
- _memory_group.manage(&_scores_flattened);
_flatten_scores_kernel.configure(scores, &_scores_flattened);
}
+ CLTensor *anchors_to_use = &_all_anchors;
+ CLTensor *deltas_to_use = &_deltas_flattened;
+ if(_is_qasymm8)
+ {
+ _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
+ _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
+ _memory_group.manage(&_all_anchors_f32);
+ _memory_group.manage(&_deltas_flattened_f32);
+ // Dequantize anchors to float
+ _dequantize_anchors.configure(&_all_anchors, &_all_anchors_f32);
+ _all_anchors.allocator()->allocate();
+ anchors_to_use = &_all_anchors_f32;
+ // Dequantize deltas to float
+ _dequantize_deltas.configure(&_deltas_flattened, &_deltas_flattened_f32);
+ _deltas_flattened.allocator()->allocate();
+ deltas_to_use = &_deltas_flattened_f32;
+ }
// Bounding box transform
_memory_group.manage(&_all_proposals);
BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f);
- _bounding_box_kernel.configure(&_all_anchors, &_all_proposals, &_deltas_flattened, bbox_info);
- _deltas_flattened.allocator()->allocate();
- _all_anchors.allocator()->allocate();
+ _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
+ deltas_to_use->allocator()->allocate();
+ anchors_to_use->allocator()->allocate();
+ _all_proposals_to_use = &_all_proposals;
+ if(_is_qasymm8)
+ {
+ _memory_group.manage(&_all_proposals_quantized);
+ // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
+ _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+ _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized);
+ _all_proposals.allocator()->allocate();
+ _all_proposals_to_use = &_all_proposals_quantized;
+ }
// The original layer implementation first selects the best pre_nms_topN anchors (thus having a lightweight sort)
// that are then transformed by bbox_transform. The boxes generated are then fed into a non-sorting NMS operation.
// Since we are reusing the NMS layer and we don't implement any CL/sort, we let NMS do the sorting (of all the input)
@@ -128,12 +165,12 @@
_memory_group.manage(&_keeps_nms_unused);
// Note that NMS needs outputs preinitialized.
- auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, data_type);
- auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, data_type);
+ auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
+ auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
// Initialize temporaries (unused) outputs
- _classes_nms_unused.allocator()->init(TensorInfo(TensorShape(1, 1), 1, data_type));
+ _classes_nms_unused.allocator()->init(TensorInfo(TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo));
_keeps_nms_unused.allocator()->init(*scores_out->info());
// Save the output (to map and unmap them at run)
@@ -141,26 +178,26 @@
_num_valid_proposals = num_valid_proposals;
_memory_group.manage(&_proposals_4_roi_values);
- _cpp_nms_kernel.configure(&_scores_flattened, &_all_proposals, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
- BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
+ _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
+ BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
_keeps_nms_unused.allocator()->allocate();
_classes_nms_unused.allocator()->allocate();
- _all_proposals.allocator()->allocate();
+ _all_proposals_to_use->allocator()->allocate();
_scores_flattened.allocator()->allocate();
// Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
- _padded_copy_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
_proposals_4_roi_values.allocator()->allocate();
-
- _memset_kernel.configure(proposals, PixelValue());
}
Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
@@ -169,8 +206,17 @@
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int values_per_roi = info.values_per_roi();
+ const bool is_qasymm8 = scores->data_type() == DataType::QASYMM8;
+
ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
+ if(is_qasymm8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
+ const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
+ ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
+ }
+
TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
@@ -190,15 +236,36 @@
TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
- TensorInfo scores_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+ TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, BoundingBoxTransformInfo(info.im_width(), info.im_height(),
- 1.f)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(&proposals_4_roi_values, proposals, PaddingList{ { 0, 1 } }));
- ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(proposals, PixelValue()));
+ TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
+ TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
+ if(is_qasymm8)
+ {
+ TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(&all_anchors_info, &all_anchors_f32_info));
+
+ TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+
+ TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+ proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
if(num_valid_proposals->total_size() > 0)
{
@@ -212,7 +279,17 @@
ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(proposals, deltas);
+ if(is_qasymm8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
+ const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
+ ARM_COMPUTE_RETURN_ERROR_ON(proposals_qinfo.scale != 0.125f);
+ ARM_COMPUTE_RETURN_ERROR_ON(proposals_qinfo.offset != 0);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(proposals, scores);
+ }
}
if(scores_out->total_size() > 0)
@@ -229,7 +306,7 @@
{
// Map inputs
_scores_flattened.map(true);
- _all_proposals.map(true);
+ _all_proposals_to_use->map(true);
// Map outputs
_scores_out->map(CLScheduler::get().queue(), true);
@@ -239,7 +316,7 @@
_classes_nms_unused.map(true);
// Run nms
- CPPScheduler::get().schedule(&_cpp_nms_kernel, Window::DimX);
+ _cpp_nms.run();
// Unmap outputs
_keeps_nms_unused.unmap();
@@ -250,7 +327,7 @@
// Unmap inputs
_scores_flattened.unmap();
- _all_proposals.unmap();
+ _all_proposals_to_use->unmap();
}
void CLGenerateProposalsLayer::run()
@@ -270,12 +347,23 @@
CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
CLScheduler::get().enqueue(_flatten_scores_kernel, false);
+ if(_is_qasymm8)
+ {
+ CLScheduler::get().enqueue(_dequantize_anchors, false);
+ CLScheduler::get().enqueue(_dequantize_deltas, false);
+ }
+
// Build the boxes
CLScheduler::get().enqueue(_bounding_box_kernel, false);
+
+ if(_is_qasymm8)
+ {
+ CLScheduler::get().enqueue(_quantize_all_proposals, false);
+ }
+
// Non maxima suppression
run_cpp_nms_kernel();
// Add dummy batch indexes
- CLScheduler::get().enqueue(_memset_kernel, true);
- CLScheduler::get().enqueue(_padded_copy_kernel, true);
+ CLScheduler::get().enqueue(_pad_kernel, true);
}
} // namespace arm_compute
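For the QASYMM8 path added above, anchors and deltas are dequantized to F32 before the bounding-box transform, and the resulting proposals are requantized to QASYMM16 with a fixed scale of 0.125 and a zero offset. A minimal per-value sketch of those two conversions, using the standard affine quantization formulas rather than the CLDequantizationLayerKernel / CLQuantizationLayerKernel the function actually dispatches to:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Affine dequantization of a QASYMM8 value: real = scale * (q - offset).
    inline float dequantize_qasymm8(uint8_t q, float scale, int32_t offset)
    {
        return scale * (static_cast<int32_t>(q) - offset);
    }

    // Requantization to QASYMM16 with the fixed (scale = 0.125, offset = 0)
    // parameters used for the proposals tensor: q = clamp(round(real / 0.125), 0, 65535).
    inline uint16_t quantize_proposal_qasymm16(float real)
    {
        const long rounded = std::lround(real / 0.125f);
        return static_cast<uint16_t>(std::min<long>(std::max<long>(rounded, 0), 65535));
    }

The clamp bounds follow from QASYMM16 being an unsigned 16-bit type; the kernels configured above perform the same conversions tensor-wide.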
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
new file mode 100644
index 0000000..2b0987f
--- /dev/null
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+CLInstanceNormalizationLayer::CLInstanceNormalizationLayer()
+{
+}
+
+void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernel>();
+ k->configure(input, output, gamma, beta, epsilon);
+ _kernel = std::move(k);
+}
+
+Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
+{
+ return CLInstanceNormalizationLayerKernel::validate(input, output, gamma, beta, epsilon);
+}
+} // namespace arm_compute
\ No newline at end of file
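The new function wraps CLInstanceNormalizationLayerKernel in the usual simple-function pattern (configure a kernel, hand it to _kernel), presumably via ICLSimpleFunction given the assignment above. As a reference for what the kernel computes, a per-plane sketch of instance normalization; the formula is the standard definition and the epsilon placement is an assumption here, not something spelled out in this diff:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Instance normalization of one (sample, channel) plane flattened to a vector:
    // y = gamma * (x - mean) / sqrt(variance + epsilon) + beta
    // Assumes x is non-empty.
    std::vector<float> instance_normalize(const std::vector<float> &x, float gamma, float beta, float epsilon)
    {
        float mean = 0.f;
        for(float v : x)
        {
            mean += v;
        }
        mean /= static_cast<float>(x.size());

        float variance = 0.f;
        for(float v : x)
        {
            variance += (v - mean) * (v - mean);
        }
        variance /= static_cast<float>(x.size());

        std::vector<float> y(x.size());
        for(std::size_t i = 0; i < x.size(); ++i)
        {
            y[i] = gamma * (x[i] - mean) / std::sqrt(variance + epsilon) + beta;
        }
        return y;
    }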
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index e76e4f6..7d1c818 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -38,7 +38,7 @@
{
constexpr int max_input_tensor_dim = 3;
} // namespace
-
+
CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
{
@@ -46,6 +46,9 @@
void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon)
{
+ // Reset auxiliary tensor
+ _sumsq.allocator()->init(TensorInfo());
+
// Manage intermediate buffers
_memory_group.manage(&_sumsq);
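Re-initializing _sumsq with an empty TensorInfo means configure() can now be called again on the same CLL2NormalizeLayer object without the auxiliary sum-of-squares tensor keeping the shape from the previous configuration. For reference, the computation performed along the chosen axis is plain L2 normalization; a 1D sketch, where treating epsilon as a lower bound on the squared norm follows the usual ACL convention and is an assumption rather than something stated in this hunk:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // L2 normalization of a 1D vector: y_i = x_i / sqrt(max(sum_j x_j^2, epsilon)).
    std::vector<float> l2_normalize(const std::vector<float> &x, float epsilon)
    {
        float sum_sq = 0.f;
        for(float v : x)
        {
            sum_sq += v * v;
        }
        const float norm = std::sqrt(std::max(sum_sq, epsilon));

        std::vector<float> y(x.size());
        std::transform(x.begin(), x.end(), y.begin(), [norm](float v) { return v / norm; });
        return y;
    }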
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index 11cf85e..e5f1278 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -159,8 +159,7 @@
const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
int output_multiplier = 0;
int output_shift = 0;
-
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
_memory_group.manage(&_output_lowp);
_output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
@@ -361,12 +360,13 @@
input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
- // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
const TensorInfo output_lowp(output_highp.tensor_shape(), 1, DataType::QSYMM16, qsymm_3);
- const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
- ARM_COMPUTE_UNUSED(multiplier);
- ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
+ const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
// _output_stage
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp));
@@ -504,7 +504,7 @@
_tanh_output_state.run();
_mul_output_state_tmp_output_gate.run();
- // Requantize output state from QSYMM16 to QASYMM16
+ // Requantize output state from QSYMM16 to QASYMM8
_dequantize.run();
_quantize.run();
}
@@ -553,4 +553,4 @@
}
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
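Switching from calculate_quantized_multiplier_less_than_one to calculate_quantized_multiplier allows the effective multiplier 4096.f * input_scale * weights_scale to be 1.0 or larger, which is why the old multiplier > 1.0f rejection disappears from validate(). A rough sketch of the kind of decomposition such a helper performs, splitting a real multiplier into an int32 fixed-point factor and a power-of-two shift; the exact sign convention ACL uses for the shift is an assumption here:

    #include <cmath>
    #include <cstdint>

    // Decompose a positive real multiplier m as m ~= q * 2^(-31) * 2^(shift),
    // with q an int32 in [2^30, 2^31). A negative shift means an extra right shift
    // at runtime; a positive shift covers multipliers greater than or equal to 1.
    void decompose_multiplier(double m, int32_t *quantized_multiplier, int *shift)
    {
        int exponent = 0;
        const double mantissa = std::frexp(m, &exponent); // mantissa in [0.5, 1)

        int64_t q = static_cast<int64_t>(std::llround(mantissa * (1ll << 31)));
        if(q == (1ll << 31)) // rounding can push the mantissa up to exactly 1.0
        {
            q /= 2;
            ++exponent;
        }
        *quantized_multiplier = static_cast<int32_t>(q);
        *shift                = exponent;
    }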
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 99e3121..8f36a69 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,183 +23,25 @@
*/
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/ToolchainSupport.h"
-
namespace arm_compute
{
CLPadLayer::CLPadLayer()
- : _copy_kernel(), _mode(), _padding(), _memset_kernel(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
+ : _pad_kernel(), _copy_kernel(), _perform_pad(false)
{
}
-void CLPadLayer::configure_constant_mode(ICLTensor *input, ICLTensor *output, const PaddingList &padding, const PixelValue constant_value)
-{
- // Set the pages of the output to the constant_value.
- _memset_kernel.configure(output, constant_value);
-
- // Fill out padding list with zeroes.
- PaddingList padding_extended = padding;
- for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
- {
- padding_extended.emplace_back(PaddingInfo{ 0, 0 });
- }
-
- // Create a window within the output tensor where the input will be copied.
- Window copy_window = Window();
- for(uint32_t i = 0; i < output->info()->num_dimensions(); ++i)
- {
- copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->info()->dimension(i), 1));
- }
- // Copy the input to the output, leaving the padding filled with the constant_value.
- _copy_kernel.configure(input, output, PaddingList(), &copy_window);
-}
-
-void CLPadLayer::configure_reflect_symmetric_mode(ICLTensor *input, ICLTensor *output)
-{
- int64_t last_padding_dimension = _padding.size() - 1;
- // Reflecting can be performed by effectively unfolding the input as follows:
- // For each dimension starting at DimX:
- // Create a before and after slice, which values depend on the selected padding mode
- // Concatenate the before and after padding with the tensor to be padded
-
- // Two strided slice functions will be required for each dimension padded as well as a
- // concatenate function and the tensors to hold the temporary results.
- _slice_functions.resize(2 * _num_dimensions);
- _slice_results.resize(2 * _num_dimensions);
- _concat_functions.resize(_num_dimensions);
- _concat_results.resize(_num_dimensions - 1);
-
- Coordinates starts_before{};
- Coordinates ends_before{};
- Coordinates starts_after{};
- Coordinates ends_after{};
- Coordinates strides{};
- ICLTensor *prev = input;
- for(uint32_t i = 0; i < _num_dimensions; ++i)
- {
- // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
- if(i > 0)
- {
- strides.set(i - 1, 1);
- }
-
- if(_padding[i].first > 0 || _padding[i].second > 0)
- {
- // Set the starts, ends, and strides values for the current dimension.
- // Due to the bit masks passed to strided slice, the values below the current dimension in
- // starts and ends will be ignored so do not need to be modified.
- if(_mode == PaddingMode::REFLECT)
- {
- starts_before.set(i, _padding[i].first);
- ends_before.set(i, 0);
- starts_after.set(i, input->info()->dimension(i) - 2);
- ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 2);
- strides.set(i, -1);
- }
- else
- {
- starts_before.set(i, _padding[i].first - 1);
- ends_before.set(i, -1);
- starts_after.set(i, input->info()->dimension(i) - 1);
- ends_after.set(i, input->info()->dimension(i) - _padding[i].second - 1);
- strides.set(i, -1);
- }
-
- // Strided slice wraps negative indexes around to the end of the range,
- // instead this should indicate use of the full range and so the bit mask will be modified.
- const int32_t begin_mask_before = starts_before[i] < 0 ? ~0 : ~(1u << i);
- const int32_t end_mask_before = ends_before[i] < 0 ? ~0 : ~(1u << i);
- const int32_t begin_mask_after = starts_after[i] < 0 ? ~0 : ~(1u << i);
- const int32_t end_mask_after = ends_after[i] < 0 ? ~0 : ~(1u << i);
-
- // Reflect the input values for the padding before and after the input.
- std::vector<ICLTensor *> concat_vector;
- if(_padding[i].first > 0)
- {
- if(i < prev->info()->num_dimensions())
- {
- _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
- concat_vector.push_back(&_slice_results[2 * i]);
- }
- else
- {
- // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
- concat_vector.push_back(prev);
- }
- }
- concat_vector.push_back(prev);
- if(_padding[i].second > 0)
- {
- if(i < prev->info()->num_dimensions())
- {
- _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
- concat_vector.push_back(&_slice_results[2 * i + 1]);
- }
- else
- {
- // Performing the slice is unnecessary if the result would simply be a copy of the tensor.
- concat_vector.push_back(prev);
- }
- }
- // Concatenate the padding before and after with the input.
- ICLTensor *out = (static_cast<int32_t>(i) == last_padding_dimension) ? output : &_concat_results[i];
- _concat_functions[i].configure(concat_vector, out, i);
- prev = out;
- }
- }
- for(uint32_t i = 0; i < _num_dimensions; ++i)
- {
- if((static_cast<int32_t>(i) != last_padding_dimension))
- {
- _concat_results[i].allocator()->allocate();
- }
- _slice_results[2 * i].allocator()->allocate();
- _slice_results[2 * i + 1].allocator()->allocate();
- }
-}
-
void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
- _padding = padding;
- _mode = mode;
-
- TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
-
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
-
- // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
- int64_t last_padding_dimension = _padding.size() - 1;
- for(; last_padding_dimension >= 0; --last_padding_dimension)
+ _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
{
- if(_padding[last_padding_dimension].first > 0 || _padding[last_padding_dimension].second > 0)
- {
- break;
- }
- }
- _num_dimensions = last_padding_dimension + 1;
- if(_num_dimensions > 0)
+ return info.first > 0 || info.second > 0;
+ });
+
+ if(_perform_pad)
{
- switch(_mode)
- {
- case PaddingMode::CONSTANT:
- {
- configure_constant_mode(input, output, padding, constant_value);
- break;
- }
- case PaddingMode::REFLECT:
- case PaddingMode::SYMMETRIC:
- {
- configure_reflect_symmetric_mode(input, output);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Padding mode not supported.");
- }
+ _pad_kernel.configure(input, output, padding, constant_value, mode);
}
else
{
@@ -207,111 +49,34 @@
_copy_kernel.configure(input, output);
}
}
-
Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
- ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
-
- TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
-
- // Use CLCopyKernel and CLMemsetKernel to validate all padding modes as this includes all of the shape and info validation.
- PaddingList padding_extended = padding;
- for(size_t i = padding.size(); i < TensorShape::num_max_dimensions; i++)
+ bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
{
- padding_extended.emplace_back(PaddingInfo{ 0, 0 });
- }
+ return info.first > 0 || info.second > 0;
+ });
- Window copy_window = Window();
- for(uint32_t i = 0; i < padded_shape.num_dimensions(); ++i)
+ if(perform_pad)
{
- copy_window.set(i, Window::Dimension(padding_extended[i].first, padding_extended[i].first + input->dimension(i), 1));
- }
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
- ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
- ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, constant_value));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, &input->clone()->set_tensor_shape(padded_shape), PaddingList(), &copy_window));
- ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(&input->clone()->set_tensor_shape(padded_shape), constant_value));
- }
-
- switch(mode)
- {
- case PaddingMode::CONSTANT:
- {
- break;
- }
- case PaddingMode::REFLECT:
- case PaddingMode::SYMMETRIC:
- {
- for(uint32_t i = 0; i < padding.size(); ++i)
- {
- if(mode == PaddingMode::REFLECT)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first > input->dimension(i));
- ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second > input->dimension(i));
- }
- }
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Invalid mode");
- }
+ Window copy_window = Window();
+ copy_window.use_tensor_dimensions(output->tensor_shape());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
}
return Status{};
}
-
void CLPadLayer::run()
{
- if(_num_dimensions > 0)
+ if(_perform_pad)
{
- switch(_mode)
- {
- case PaddingMode::CONSTANT:
- {
- CLScheduler::get().enqueue(_memset_kernel, false);
- CLScheduler::get().enqueue(_copy_kernel, true);
- break;
- }
- case PaddingMode::REFLECT:
- case PaddingMode::SYMMETRIC:
- {
- for(uint32_t i = 0; i < _num_dimensions; ++i)
- {
- if(_padding[i].first > 0 || _padding[i].second > 0)
- {
- if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
- {
- _slice_functions[2 * i].run();
- }
- if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
- {
- _slice_functions[2 * i + 1].run();
- }
- CLScheduler::get().sync();
- _concat_functions[i].run();
- CLScheduler::get().sync();
- }
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Padding mode not supported.");
- }
+ CLScheduler::get().enqueue(_pad_kernel);
}
else
{
- CLScheduler::get().enqueue(_copy_kernel, true);
+ CLScheduler::get().enqueue(_copy_kernel);
}
}
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
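The removed REFLECT/SYMMETRIC machinery built mirrored borders out of strided slices and concatenations; all three modes are now delegated to CLPadLayerKernel, with the copy kernel kept only for an all-zero PaddingList. As a reminder of what the two mirroring modes produce, a 1D sketch: REFLECT mirrors around the edge sample without repeating it, SYMMETRIC repeats it, which matches the >= versus > bounds the old validate() enforced on the padding extents.

    #include <cstddef>
    #include <vector>

    // 1D mirror padding. Assumes before/after stay within the limits the old
    // validate() enforced (strictly less than the size for REFLECT, at most the
    // size for SYMMETRIC), so a single mirroring step is always enough.
    std::vector<int> pad_mirror_1d(const std::vector<int> &x, std::size_t before, std::size_t after, bool symmetric)
    {
        const std::ptrdiff_t n = static_cast<std::ptrdiff_t>(x.size());
        std::vector<int> out;
        for(std::ptrdiff_t i = -static_cast<std::ptrdiff_t>(before); i < n + static_cast<std::ptrdiff_t>(after); ++i)
        {
            std::ptrdiff_t idx = i;
            if(idx < 0)
            {
                idx = symmetric ? -idx - 1 : -idx;
            }
            else if(idx >= n)
            {
                idx = symmetric ? 2 * n - idx - 1 : 2 * n - idx - 2;
            }
            out.push_back(x[static_cast<std::size_t>(idx)]);
        }
        return out;
    }
    // pad_mirror_1d({1, 2, 3, 4}, 2, 2, false) -> {3, 2, 1, 2, 3, 4, 3, 2}   (REFLECT)
    // pad_mirror_1d({1, 2, 3, 4}, 2, 2, true)  -> {2, 1, 1, 2, 3, 4, 4, 3}   (SYMMETRIC)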
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 38f0a75..3aa5a81 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -26,15 +26,17 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/Tensor.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
@@ -56,17 +58,52 @@
} // namespace
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
+ : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape_kernel(), _op(), _num_of_stages(), _reduction_axis(), _is_serial(),
+ _is_reshape_required(false)
{
}
-Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
- const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
- bool is_serial = is_data_type_quantized(input->data_type()) || axis != 0;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+ const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
+ const bool is_serial = needs_serialized_reduction(op, input->data_type(), axis);
+ const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+ const bool is_reshape_required = !keep_dims || is_arg_min_max;
+
+ if(is_reshape_required)
+ {
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
+ }
+
+ auto *output_internal = output;
+
+ TensorInfo output_before_reshape;
+ const auto input_shape = input->tensor_shape();
+ const auto input_data_type = input->data_type();
+ const auto input_num_channels = input->num_channels();
+ const auto input_qinfo = input->quantization_info();
+ const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type();
+
+ auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
+ {
+ ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
+ };
+
+ if(is_reshape_required)
+ {
+ auto shape_before_reshape = input_shape;
+ shape_before_reshape.set(axis, 1);
+ initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channels, input_qinfo);
+ output_internal = &output_before_reshape;
+ }
+
if(is_serial)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op));
}
else
{
@@ -74,14 +111,13 @@
std::vector<TensorInfo> sums_vector(num_of_stages - 1);
// Create intermediate tensor info
- TensorShape shape{ input->tensor_shape() };
+ TensorShape shape{ input_shape };
+
+ shape.set(0, ceil(shape.x() / 128.f));
for(unsigned int i = 0; i < num_of_stages - 1; i++)
{
- shape.set(0, ceil(shape.x() / 128.f));
- sums_vector[i].set_data_type(input->data_type());
- sums_vector[i].set_tensor_shape(shape);
- sums_vector[i].set_num_channels(input->num_channels());
+ initialize_tensorinfo(sums_vector[i], shape, input_data_type, input_num_channels, input_qinfo);
}
ReductionOperation first_kernel_op;
@@ -130,17 +166,72 @@
// Validate ReductionOperation on the last stage
const unsigned int last_stage = num_of_stages - 1;
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output, axis, last_kernel_op, input->dimension(0)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output_internal, axis, last_kernel_op, input->dimension(0)));
+ }
+
+ if(is_reshape_required)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(output_internal, output));
}
return Status{};
}
-void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output)
{
- _num_of_stages = calculate_number_of_stages(input->info(), axis);
- _reduction_axis = axis;
- _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
+ if(!_is_reshape_required && _is_serial)
+ {
+ return output;
+ }
+
+ auto intermediate_result_vector_size = _is_serial ? 1 : _num_of_stages;
+ const auto is_arg_min_max = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN);
+
+ if(!_is_reshape_required)
+ {
+ --intermediate_result_vector_size;
+ }
+
+ _results_vector.resize(intermediate_result_vector_size);
+ auto shape = input->info()->tensor_shape();
+
+ shape.set(_reduction_axis, _is_serial ? 1 : ceil(shape.x() / 128.f));
+
+ for(auto &v : _results_vector)
+ {
+ if(&v == &_results_vector.back() && _is_reshape_required)
+ {
+ shape.set(_reduction_axis, 1);
+ }
+ v.allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+ }
+
+ if(is_arg_min_max)
+ {
+ _results_vector.back().info()->set_data_type(DataType::S32).set_is_resizable(true).reset_padding();
+ }
+
+ return _is_reshape_required ? &_results_vector.back() : output;
+}
+
+void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+{
+ _op = op;
+ _num_of_stages = calculate_number_of_stages(input->info(), axis);
+ _reduction_axis = axis;
+ _is_serial = needs_serialized_reduction(op, input->info()->data_type(), axis);
+ const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+ _is_reshape_required = !keep_dims || is_arg_min_max;
+
+ auto *output_internal = configure_intermediate_result_vector(input, output);
+
+ // ArgMinMax might not provide an initialized output tensor, so initialize it here.
+ if(_is_reshape_required)
+ {
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ }
// Configure reduction operation kernels
_reduction_kernels_vector.resize(_num_of_stages);
@@ -148,20 +239,16 @@
// Create temporary tensors
if(_is_serial)
{
- _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ if(_is_reshape_required)
+ {
+ _memory_group.manage(&_results_vector.back());
+ }
+
+ _reduction_kernels_vector[0].configure(input, output_internal, axis, op, 0);
}
else
{
_border_handlers_vector.resize(_num_of_stages);
- _results_vector.resize(_num_of_stages - 1);
- TensorShape shape{ input->info()->tensor_shape() };
- for(unsigned int i = 0; i < _num_of_stages - 1; i++)
- {
- shape.set(0, ceil(shape.x() / 128.f));
- _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
- }
-
- // Apply ReductionOperation only on first kernel
_memory_group.manage(&_results_vector[0]);
ReductionOperation first_kernel_op;
@@ -262,10 +349,22 @@
// Apply ReductionOperation on the last stage
const unsigned int last_stage = _num_of_stages - 1;
const unsigned int input_width = input->info()->dimension(0);
- _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output, axis, last_kernel_op, input_width);
+
+ if(_is_reshape_required)
+ {
+ _memory_group.manage(&_results_vector.back());
+ }
+
+ _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width);
_border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
_results_vector[last_stage - 1].allocator()->allocate();
}
+
+ if(_is_reshape_required)
+ {
+ _reshape_kernel.configure(&_results_vector.back(), output);
+ _results_vector.back().allocator()->allocate();
+ }
}
void CLReductionOperation::run()
@@ -284,4 +383,10 @@
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
}
+
+ if(_is_reshape_required)
+ {
+ CLScheduler::get().enqueue(_reshape_kernel, false);
+ }
}
+} // namespace arm_compute
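Both validate() and configure() now agree on when a trailing reshape is required: whenever keep_dims is false, and always for ARG_IDX_MIN/ARG_IDX_MAX, whose final output additionally becomes S32. A small sketch of the shape bookkeeping, using a plain vector instead of TensorShape so it stands alone; treat it as an illustration of how compute_reduced_shape is used here rather than a re-implementation of that helper:

    #include <cstddef>
    #include <vector>

    // Shape of a reduction along `axis`: the axis collapses to 1 when keep_dims
    // is true and is dropped entirely when keep_dims is false (the case that
    // makes CLReductionOperation append its reshape kernel).
    std::vector<std::size_t> reduced_shape(std::vector<std::size_t> shape, std::size_t axis, bool keep_dims)
    {
        if(keep_dims)
        {
            shape[axis] = 1;
        }
        else
        {
            shape.erase(shape.begin() + static_cast<std::ptrdiff_t>(axis));
        }
        return shape;
    }
    // reduced_shape({128, 24, 3}, 0, true)  -> {1, 24, 3}  : kernel output used directly
    // reduced_shape({128, 24, 3}, 0, false) -> {24, 3}     : reshape kernel appended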
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7e41dba..32d7f44 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -30,18 +30,19 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
namespace arm_compute
{
-CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+template <bool IS_LOG>
+CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel_ptr(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(),
_needs_flattening(false)
{
}
-void CLSoftmaxLayer::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
{
// Flatten the input
const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
@@ -70,11 +71,12 @@
auto_init_if_empty(*output->info(), *input->info()->clone());
}
-void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info(), beta, axis));
+ ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayerGeneric<IS_LOG>::validate(input->info(), output->info(), beta, axis));
// We don't need flattening only in the case the input is 2D and axis is 1
_needs_flattening = axis != 1;
@@ -115,8 +117,12 @@
_memory_group.manage(&_max);
_memory_group.manage(&_sum);
+ SoftmaxKernelInfo softmax_info;
+ softmax_info.beta = beta;
+ softmax_info.is_log = IS_LOG;
+
// Configure kernels
- _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, beta);
+ _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, softmax_info);
if(_needs_flattening)
{
@@ -124,7 +130,7 @@
_memory_group.manage(&_output_flattened);
// The normalization kernel stores the result in a flat output tensor
- _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, beta);
+ _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, softmax_info);
// Reshape the flat output into the requested (4D) output
_reshape_kernel.configure(&_output_flattened, output);
@@ -136,7 +142,7 @@
else
{
// Softmax 2D case
- _norm_kernel.configure(&_tmp, &_sum, output, beta);
+ _norm_kernel.configure(&_tmp, &_sum, output, softmax_info);
}
// Allocate intermediate buffers
@@ -145,7 +151,8 @@
_sum.allocator()->allocate();
}
-Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
+template <bool IS_LOG>
+Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
@@ -189,7 +196,8 @@
return Status{};
}
-void CLSoftmaxLayer::run()
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
@@ -207,4 +215,7 @@
}
}
+template class CLSoftmaxLayerGeneric<false>;
+template class CLSoftmaxLayerGeneric<true>;
+
} // namespace arm_compute
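The explicit instantiations above produce CLSoftmaxLayerGeneric<false> and CLSoftmaxLayerGeneric<true>; the public header presumably exposes them under friendlier aliases, but those are not part of this diff. The IS_LOG flag only changes the normalization step carried in SoftmaxKernelInfo; a 1D reference sketch of the two variants with the usual max-subtraction for numerical stability, where beta scales the logits as in the kernel configuration above:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // softmax_i    = exp(beta * (x_i - max)) / sum_j exp(beta * (x_j - max))
    // logsoftmax_i = beta * (x_i - max) - log(sum_j exp(beta * (x_j - max)))
    // Assumes x is non-empty.
    std::vector<float> softmax_1d(const std::vector<float> &x, float beta, bool is_log)
    {
        const float max_val = *std::max_element(x.begin(), x.end());

        std::vector<float> shifted(x.size());
        float sum = 0.f;
        for(std::size_t i = 0; i < x.size(); ++i)
        {
            shifted[i] = beta * (x[i] - max_val);
            sum += std::exp(shifted[i]);
        }

        std::vector<float> y(x.size());
        for(std::size_t i = 0; i < x.size(); ++i)
        {
            y[i] = is_log ? shifted[i] - std::log(sum) : std::exp(shifted[i]) / sum;
        }
        return y;
    }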