arm_compute v19.08
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index b8224d2..5d224db 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -23,11 +23,14 @@
*/
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
+#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
-#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
@@ -44,14 +47,35 @@
{
}
-void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+void CLConcatenateLayer::configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+{
+ configure_internal(std::move(inputs_vector), output, axis);
+}
+
+void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+{
+ configure_internal(std::move(inputs_vector), output, axis);
+}
+
+Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+{
+ return validate_internal(inputs_vector, output, axis);
+}
+
+Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+{
+ return validate_internal(inputs_vector, output, axis);
+}
+
+template <typename TensorType>
+void CLConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis)
{
ARM_COMPUTE_ERROR_ON(output == nullptr);
_axis = axis;
_num_inputs = inputs_vector.size();
std::vector<ITensorInfo *> inputs_vector_info(inputs_vector.size());
- std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](ICLTensor * t)
+ std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](TensorType * t)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(t);
return t->info();
@@ -122,12 +146,24 @@
}
break;
}
+ case 3:
+ {
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = support::cpp14::make_unique<CLBatchConcatenateLayerKernel>();
+ kernel->configure(inputs_vector.at(i), offset, output);
+ offset += inputs_vector.at(i)->info()->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Axis not supported");
}
}
-Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+template <typename TensorInfoType>
+Status CLConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
const unsigned int num_inputs = inputs_vector.size();
@@ -182,6 +218,15 @@
}
break;
}
+ case 3:
+ {
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLBatchConcatenateLayerKernel::validate(input, offset, output));
+ offset += input->dimension(axis);
+ }
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Axis not supported");
}
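
Usage sketch (not library code): the rewrite folds the per-axis concatenation
functions into this one class and adds a batch path for axis 3. A minimal call,
assuming the CL scheduler is initialised and all tensors are allocated:

    #include <vector>
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

    using namespace arm_compute;

    void concat_batches(CLTensor &t0, CLTensor &t1, CLTensor &dst)
    {
        std::vector<ICLTensor *> inputs = { &t0, &t1 };
        CLConcatenateLayer concat;
        concat.configure(inputs, &dst, 3); // axis 3: concatenate along the batch dimension
        concat.run();
    }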
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 165d523..d794cde 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -188,11 +188,17 @@
}
else
{
- if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && ( CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+ // SRGAN
+ if((input->dimension(idx_h) > 720U) && (output->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
+ && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+ if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
{
return ConvolutionMethod::FFT;
}
- if (input->dimension(idx_c) < 16)
+ if(input->dimension(idx_c) < 16)
{
return ConvolutionMethod::GEMM;
}
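
A paraphrase of the updated selection order as a standalone sketch (the real
get_convolution_method also runs each kernel's validate() check, elided here,
and falls through to further heuristics for the remaining cases):

    enum class Method { DIRECT, FFT, GEMM, OTHER };

    Method pick_method(unsigned int in_h, unsigned int out_h, unsigned int kernel_h,
                       unsigned int in_c, unsigned int out_c, unsigned int pad_top)
    {
        // SRGAN case: very tall feature maps with 9x9 kernels and small top padding
        if(in_h > 720U && out_h > 720U && kernel_h == 9 && pad_top < 3)
            return Method::DIRECT;
        // large kernels that shrink the channel count
        if(kernel_h > 7 && in_c > out_c)
            return Method::FFT;
        // few input channels
        if(in_c < 16)
            return Method::GEMM;
        return Method::OTHER; // remaining checks elided
    }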
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index c6f79d3..7aa7714 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -42,10 +42,9 @@
}
void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
{
@@ -70,11 +69,9 @@
}
Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
-
switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
{
case DeconvolutionMethod::DIRECT:
@@ -115,18 +112,6 @@
return DeconvolutionMethod::GEMM;
}
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
-{
- configure(input, weights, bias, output, deconv_info, 0, 0, weights_info);
-}
-
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
-{
- return CLDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, 0, 0, weights_info);
-}
-
void CLDeconvolutionLayer::run()
{
prepare();
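
Migration sketch: the inner-border parameters were dead (the function marked
them unused, and the forwarding overloads removed below always passed 0, 0), so
they are dropped from the public signature. Assuming an existing
CLDeconvolutionLayer instance and configured tensors:

    // v19.05 and earlier (inner borders had to be 0 anyway):
    // deconv.configure(&src, &weights, &bias, &dst, deconv_info, 0U, 0U, weights_info);
    // v19.08:
    deconv.configure(&src, &weights, &bias, &dst, deconv_info, weights_info);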
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index c66dff0..63a45aa 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -36,20 +36,18 @@
{
}
-Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const BorderSize &inner_border,
- const PadStrideInfo &info)
+Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
{
- return CLDeconvolutionLayerUpsampleKernel::validate(input, output, inner_border, info);
+ return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info);
}
-void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_output = output;
_memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
- _upsample.configure(input, _output, inner_border, info);
+ _upsample.configure(input, _output, info);
}
void CLDeconvolutionLayerUpsample::run()
@@ -57,4 +55,4 @@
CLScheduler::get().enqueue(_memset, false);
CLScheduler::get().enqueue(_upsample, true);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
deleted file mode 100644
index f687e54..0000000
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT
- : _concat_kernels_vector(),
- _border_handlers_vector(),
- _num_inputs(0)
-{
-}
-
-void CLDepthConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output) // NOLINT
-{
- _num_inputs = inputs_vector.size();
-
- std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < _num_inputs; i++)
- {
- inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
- }
-
- _concat_kernels_vector.resize(_num_inputs);
- _border_handlers_vector.resize(_num_inputs);
-
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
- ARM_COMPUTE_ERROR_THROW_ON(CLDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
-
- unsigned int depth_offset = 0;
- for(unsigned int i = 0; i < _num_inputs; i++)
- {
- _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
-
- depth_offset += inputs_vector.at(i)->info()->dimension(2);
- }
-
- // Set valid region from shape
- output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
-}
-
-Status CLDepthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
-
- // Output auto inizialitation if not yet initialized
- TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
- auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
-
- unsigned int depth_offset = 0;
- for(const auto &input : inputs_vector)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, depth_offset, &tmp_output_info));
- depth_offset += input->dimension(2);
- }
-
- return Status{};
-}
-
-void CLDepthConcatenateLayer::run()
-{
- cl::CommandQueue q = CLScheduler::get().queue();
-
- for(unsigned i = 0; i < _num_inputs; i++)
- {
- CLScheduler::get().enqueue(_border_handlers_vector[i], false);
- CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
- }
-}
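
Migration sketch: callers of the deleted CLDepthConcatenateLayer can use
CLConcatenateLayer with the depth axis instead (the removed code concatenated
along Window::DimZ, i.e. dimension 2):

    std::vector<ICLTensor *> inputs = { &t0, &t1 };
    CLConcatenateLayer concat;
    concat.configure(inputs, &dst, 2); // axis 2: depth concatenation
    concat.run();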
diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
new file mode 100644
index 0000000..08aef92
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDepthToSpaceLayer::CLDepthToSpaceLayer()
+ : _depth_to_space_kernel()
+{
+}
+
+void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+{
+ _depth_to_space_kernel.configure(input, output, block_shape);
+}
+
+Status CLDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+{
+ return CLDepthToSpaceLayerKernel::validate(input, output, block_shape);
+}
+
+void CLDepthToSpaceLayer::run()
+{
+ CLScheduler::get().enqueue(_depth_to_space_kernel, true);
+}
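
Usage sketch for the new function (the shape semantics below are the usual
depth-to-space contract and are an assumption, not spelled out in this file):

    // src: [W, H, C, N] with C divisible by block_shape * block_shape
    // dst: [W * block_shape, H * block_shape, C / (block_shape * block_shape), N]
    CLDepthToSpaceLayer d2s;
    d2s.configure(&src, &dst, 2 /* block_shape */);
    d2s.run();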
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 97b0a01..f01b58a 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -90,6 +90,7 @@
// Configure the function to transform the weights tensor from HWI -> IHW
_permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
_permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+ _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
input_to_use = &_permuted_input;
weights_to_use = &_permuted_weights;
@@ -130,7 +131,7 @@
PixelValue &&zero_value(0.f);
if(is_data_type_quantized_asymmetric(input->info()->data_type()))
{
- zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
+ zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
}
_border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
}
@@ -141,9 +142,10 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
const bool needs_permute = is_nhwc && (depth_multiplier > 1);
- const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
+ const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && is_quantized;
const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
@@ -151,6 +153,17 @@
info.c0 = 4;
info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
+ if(is_quantized)
+ {
+ const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform();
+
+ const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+ ARM_COMPUTE_UNUSED(multiplier);
+ ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
+ }
+
if(needs_permute)
{
TensorShape permuted_input_shape = input->tensor_shape();
@@ -176,7 +189,10 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
act_info, dilation));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
+ }
}
else
{
@@ -288,6 +304,10 @@
const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
const size_t conv_size = conv_w * conv_h;
+ const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+
// Im2Col configuration
TensorShape shape_im2col = input->info()->tensor_shape();
shape_im2col.set(0, patch_size);
@@ -319,11 +339,11 @@
// Output staged configuration
if(_is_quantized)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+ const UniformQuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier;
- int output_shift;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ const float multiplier = iq_info.scale * wq_info.scale / output_quant_info.scale;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
_output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
_output_reshaped.allocator()->allocate();
@@ -334,8 +354,8 @@
PixelValue zero_w(static_cast<int32_t>(0));
if(_is_quantized)
{
- zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
- zero_w = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+ zero_in = PixelValue(static_cast<int32_t>(iq_info.offset));
+ zero_w = PixelValue(static_cast<int32_t>(wq_info.offset));
}
BorderSize border_size = _v2mm_kernel.border_size();
_v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
@@ -368,7 +388,7 @@
const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
- if(can_run_optimised_3x3_kernel)
+ if(!can_run_optimised_3x3_kernel)
{
const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
@@ -410,6 +430,13 @@
if(is_quantized)
{
+ const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform();
+
+ const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+ ARM_COMPUTE_UNUSED(multiplier);
+ ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
}
@@ -421,7 +448,7 @@
}
else
{
- CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation));
}
return Status{};
}
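
The new validate() guard mirrors the assumption that the fixed-point output
stage can only represent requantisation multipliers below one. A minimal sketch
of the rejected condition, with made-up scale values:

    // multiplier = input_scale * weights_scale / output_scale must be <= 1.0f
    bool requant_multiplier_ok(float in_scale, float w_scale, float out_scale)
    {
        return (in_scale * w_scale / out_scale) <= 1.0f;
    }
    // e.g. in_scale = 0.5f, w_scale = 0.25f, out_scale = 0.05f
    //   -> multiplier = 2.5f -> configuration rejected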
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index c451bd4..bfc6ff1 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@
PixelValue &&zero_value(0.f);
if(is_data_type_quantized_asymmetric(input->info()->data_type()))
{
- zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
+ zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
}
_input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 6e14e26..c1a39ef 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -55,7 +55,6 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
-
const DataLayout data_layout = input->data_layout();
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -95,11 +94,11 @@
unsigned int padx = 0;
unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, 0, 0, out_dims, padx, pady);
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, padx, pady);
TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(), info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
return Status{};
@@ -141,7 +140,7 @@
// Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
unsigned int padx = 0;
unsigned int pady = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, 0, 0, out_dims, padx, pady);
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, padx, pady);
TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
scale_out_info.set_data_layout(data_layout);
@@ -149,7 +148,7 @@
// configure scale function
const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
- _scale_f.configure(input, &_scaled_output, BorderSize(), upsample_info);
+ _scale_f.configure(input, &_scaled_output, upsample_info);
// Setup the function to convolve the upscaled output
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
index b7e9a68..b9ebf69 100644
--- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,4 +51,59 @@
{
return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
}
+
+void CLNegLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::NEG);
+ _kernel = std::move(k);
+}
+Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::NEG);
+}
+
+void CLSinLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::SIN);
+ _kernel = std::move(k);
+}
+Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::SIN);
+}
+
+void CLAbsLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::ABS);
+ _kernel = std::move(k);
+}
+Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ABS);
+}
+void CLLogLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::LOG);
+ _kernel = std::move(k);
+}
+Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::LOG);
+}
+
+void CLRoundLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::ROUND);
+ _kernel = std::move(k);
+}
+Status CLRoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ROUND);
+}
+
} // namespace arm_compute
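
Usage sketch for the new unary functions (same pattern as the existing
CLExpLayer, assuming allocated tensors):

    CLNegLayer neg;
    neg.configure(&src, &neg_dst); // neg_dst[i] = -src[i]
    neg.run();

    CLRoundLayer rnd;
    rnd.configure(&src, &rnd_dst); // element-wise rounding
    rnd.run();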
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 28f4b13..8317e0d 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
#include "support/ToolchainSupport.h"
-#include <arm_compute/runtime/CL/functions/CLElementwiseOperations.h>
#include <utility>
@@ -124,4 +125,18 @@
{
return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
}
+
+void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::POWER, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::POWER, input1, input2, output);
+}
+
} // namespace arm_compute
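
Usage sketch for the new function (assuming allocated tensors of matching or
broadcastable shapes):

    CLElementwisePower pow_fn;
    pow_fn.configure(&base, &exponent, &dst); // dst[i] = base[i] ^ exponent[i]
    pow_fn.run();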
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 7b9229c..c5da649 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -41,10 +41,13 @@
{
if(is_data_type_quantized_asymmetric(input.data_type()))
{
+ const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info(input.quantization_info().scale, -input.quantization_info().offset);
- const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, -weights.quantization_info().offset);
+ const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset);
+ const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
// Validate gemmlowp function
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
@@ -88,8 +91,8 @@
const QuantizationInfo input_quantization_info = input->info()->quantization_info();
const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
- input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
- weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+ input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
// Configure gemmlowp function
_mm_gemmlowp.configure(input, weights, nullptr, output);
@@ -230,11 +233,15 @@
// Configure output stage for asymmetric quantized types
if(_is_quantized)
{
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+ const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+
+ float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier;
int output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
+ _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, oq_info.offset);
_gemmlowp_output.allocator()->allocate();
}
}
@@ -324,6 +331,13 @@
// Validate output stage for asymmetric quantized types
if(is_quantized)
{
+ const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+ const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+
+ ARM_COMPUTE_UNUSED(multiplier);
+ ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
}
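
For reference, a standalone sketch of the decomposition behind
calculate_quantized_multiplier_less_than_one() as assumed here (the library's
exact implementation may differ): a real multiplier < 1 is split into an int32
fixed-point multiplier and a right shift such that
multiplier ~= output_multiplier * 2^-31 * 2^-output_shift:

    #include <cmath>
    #include <cstdint>

    void quantize_multiplier_lt_one(double multiplier, int32_t *quant, int *right_shift)
    {
        int exponent = 0;
        const double q = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
        int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
        if(q_fixed == (1ll << 31)) // handle rounding up to 1.0
        {
            q_fixed /= 2;
            ++exponent;
        }
        *right_shift = -exponent; // >= 0 whenever multiplier < 1
        *quant       = static_cast<int32_t>(q_fixed);
    }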
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
index 32e4678..72dd27e 100644
--- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,20 +36,20 @@
{
}
-void CLFuseBatchNormalization::configure(const ICLTensor *conv_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
+void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *conv_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon)
+ const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
+ float epsilon, FuseBatchNormalizationType fbn_type)
{
- _fuse_bn_kernel.configure(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+ _fuse_bn_kernel.configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
-Status CLFuseBatchNormalization::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon)
+ const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+ float epsilon, FuseBatchNormalizationType fbn_type)
{
- return CLFuseBatchNormalizationKernel::validate(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+ return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
void CLFuseBatchNormalization::run()
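
The rename from conv_* to input_* and the new FuseBatchNormalizationType let
the same kernel serve more than convolution weights; the per-channel math is
unchanged. A scalar sketch of what fusion computes (hypothetical buffer names):

    const float inv_sigma = 1.f / std::sqrt(bn_var[c] + epsilon);
    fused_weights[c] = input_weights[c] * bn_gamma[c] * inv_sigma;
    fused_bias[c]    = (input_bias[c] - bn_mean[c]) * bn_gamma[c] * inv_sigma + bn_beta[c];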
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 492709f..e78395f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -30,10 +30,12 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/float_ops.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
@@ -46,7 +48,6 @@
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_mm_kernel(),
- _ma_kernel(),
_reshape_lhs_kernel(),
_reshape_rhs_kernel(),
_mm_reshaped_kernel(),
@@ -54,7 +55,6 @@
_tmp_a(),
_tmp_b(),
_original_b(nullptr),
- _run_addition(false),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
_gemm_type(GEMMType::NATIVE)
@@ -116,10 +116,10 @@
// Set the target for the kernels
_mm_kernel.set_target(gpu_target);
- GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d());
+ GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());
// Configure and tune matrix multiply kernel
- _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision());
+ _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
// Tune kernel statically
CLScheduler::get().tune_kernel_static(_mm_kernel);
@@ -160,7 +160,7 @@
lhs_info.interleave = true;
lhs_info.transpose = true;
- GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
+ GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
_memory_group.manage(&_tmp_a);
if(!_reshape_b_only_on_first_run)
@@ -175,7 +175,7 @@
_reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
// Configure and tune matrix multiply kernel
- _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision());
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
CLScheduler::get().tune_kernel_static(_mm_kernel);
@@ -189,10 +189,6 @@
void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_ON(c != nullptr);
- ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_UNUSED(c);
-
DataType data_type = a->info()->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
@@ -201,13 +197,21 @@
const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = false;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
// Set the target for the kernels
_reshape_lhs_kernel.set_target(gpu_target);
_mm_kernel.set_target(gpu_target);
- GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, false);
-
// Manage intermediate buffers
_memory_group.manage(&_tmp_a);
if(!_reshape_b_only_on_first_run)
@@ -230,7 +234,7 @@
_reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
// Configure and tune matrix multiply kernel
- _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+ _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
// Allocate intermediate tensors
_tmp_a.allocator()->allocate();
@@ -242,10 +246,6 @@
void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_ON(c != nullptr);
- ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_UNUSED(c);
-
DataType data_type = a->info()->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
@@ -254,12 +254,20 @@
const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
// Set the target for the kernels
_mm_kernel.set_target(gpu_target);
- GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
-
// Manage intermediate buffers
if(!_reshape_b_only_on_first_run)
{
@@ -279,7 +287,7 @@
_reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
// Configure and tune matrix multiply kernel
- _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+ _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
if(!_reshape_b_only_on_first_run)
{
@@ -299,21 +307,12 @@
const unsigned int n = b->dimension(0);
const unsigned int k = a->dimension(0);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool add_c = (beta != 0.f && c != nullptr);
- const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
- const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias());
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
- false, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
-
- if(add_c && !fuse_add)
- {
- // Validate matrix addition kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta,
+ false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
return Status{};
}
@@ -334,9 +333,6 @@
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool add_c = (beta != 0.f && c != nullptr);
- const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
- const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
@@ -358,7 +354,7 @@
lhs_info.interleave = true;
lhs_info.transpose = true;
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
// Validate interleave kernel
auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
@@ -369,14 +365,8 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
- true, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
-
- if(add_c && !fuse_add)
- {
- // Validate matrix addition kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta,
+ true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
return Status{};
}
@@ -398,9 +388,16 @@
const unsigned int k = a->dimension(0);
const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool add_c = (beta != 0.f && c != nullptr);
+ const bool broadcast_bias = gemm_info.broadcast_bias();
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, false);
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = false;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
@@ -419,13 +416,7 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
-
- if(add_c)
- {
- // Validate matrix addition kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
return Status{};
}
@@ -446,9 +437,16 @@
const unsigned int k = a->dimension(0);
const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool add_c = (beta != 0.f && c != nullptr);
+ const bool broadcast_bias = gemm_info.broadcast_bias();
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
@@ -464,13 +462,7 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
-
- if(add_c)
- {
- // Validate matrix addition kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
return Status{};
}
@@ -497,31 +489,30 @@
// Select GEMMType
_gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
- const bool is_gemm_v2 = (_gemm_type == GEMMType::RESHAPED_V2) || (_gemm_type == GEMMType::RESHAPED_ONLY_RHS);
- const bool add_c = (beta != 0.f && c != nullptr);
- const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
- const bool fuse_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !is_gemm_v2;
+ const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
+
+ const ICLTensor *c_to_use = fuse_add_c ? c : nullptr;
switch(_gemm_type)
{
case GEMMType::NATIVE:
{
- configure_native(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ configure_native(a, b, c_to_use, output, alpha, beta, gemm_info);
break;
}
case GEMMType::RESHAPED_V1:
{
- configure_reshaped_v1(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ configure_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info);
break;
}
case GEMMType::RESHAPED_V2:
{
- configure_reshaped_v2(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ configure_reshaped_v2(a, b, c_to_use, output, alpha, beta, gemm_info);
break;
}
case GEMMType::RESHAPED_ONLY_RHS:
{
- configure_reshaped_only_rhs(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+ configure_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info);
break;
}
default:
@@ -529,13 +520,6 @@
ARM_COMPUTE_ERROR("GEMMType not supported");
}
}
-
- // Configure matrix addition kernel
- if(add_c && !fuse_add)
- {
- _ma_kernel.configure(c, output, beta);
- _run_addition = true;
- }
}
Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
@@ -550,26 +534,30 @@
// Select GEMMType
GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target);
+ const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
+
+ const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
+
switch(gemm_type)
{
case GEMMType::NATIVE:
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c, output, alpha, beta, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c_to_use, output, alpha, beta, gemm_info));
break;
}
case GEMMType::RESHAPED_V1:
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c, output, alpha, beta, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
break;
}
case GEMMType::RESHAPED_V2:
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c, output, alpha, beta, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c_to_use, output, alpha, beta, gemm_info));
break;
}
case GEMMType::RESHAPED_ONLY_RHS:
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c, output, alpha, beta, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
break;
}
default:
@@ -592,7 +580,7 @@
{
case GEMMType::NATIVE:
{
- CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ CLScheduler::get().enqueue(_mm_kernel, true);
break;
}
case GEMMType::RESHAPED_V1:
@@ -606,7 +594,7 @@
CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
}
- CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ CLScheduler::get().enqueue(_mm_kernel, true);
break;
}
case GEMMType::RESHAPED_V2:
@@ -620,7 +608,7 @@
CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
}
- CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+ CLScheduler::get().enqueue(_mm_reshaped_kernel, true);
break;
}
case GEMMType::RESHAPED_ONLY_RHS:
@@ -631,7 +619,7 @@
CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
}
- CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, !_run_addition);
+ CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true);
break;
}
default:
@@ -639,12 +627,6 @@
ARM_COMPUTE_ERROR("GEMMType not supported");
}
}
-
- // Run matrix addition kernel
- if(_run_addition)
- {
- CLScheduler::get().enqueue(_ma_kernel);
- }
}
void CLGEMM::prepare()
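
With _ma_kernel and _run_addition gone, beta * C is no longer applied by a
separate matrix-addition pass: whenever beta != 0 and C is provided, the bias
is forwarded into the matrix-multiply kernels themselves (together with the
activation carried by GEMMInfo). A minimal call sketch, assuming allocated
tensors:

    CLGEMM gemm;
    gemm.configure(&a, &b, &bias, &dst, 1.f /* alpha */, 1.f /* beta */, GEMMInfo());
    gemm.run(); // bias contribution applied inside the fused MM kernel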
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 03d516f..be6be04 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,22 +91,27 @@
}
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
- _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
- _is_activationlayer_enabled(false), _is_prepared(false), _run_addition(true)
+ : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(),
+ _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false)
{
}
void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth)
+ int gemm_3d_depth, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col,
- _run_addition));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, gemmlowp_output_stage);
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ gemm_3d_depth, // depth_output_gemm3d
+ _skip_im2col, // reinterpret_input_as_3d
+ false, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ false, // fp_mixed_precision
+ true, // broadcast_bias
+ act_info); // activation_info
if(_is_quantized)
{
@@ -115,8 +120,8 @@
const QuantizationInfo input_quantization_info = input->info()->quantization_info();
const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
- input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
- weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+ input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
_mm_gemmlowp.configure(input, weights, biases, output, gemm_info);
@@ -126,21 +131,26 @@
}
else
{
- // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
- const bool skip_bias_in_gemm = _run_addition || !_skip_im2col;
// Configure matrix multiply function
- _mm_gemm.configure(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
+ _mm_gemm.configure(input, weights, biases, output, 1.0f, 1.0f, gemm_info);
}
}
Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, bool run_addition)
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
{
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, gemmlowp_output_stage);
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ gemm_3d_depth, // depth_output_gemm3d
+ skip_im2col, // reinterpret_input_as_3d
+ false, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ false, // fp_mixed_precision
+ true, // broadcast_bias
+ act_info); // activation_info
if(is_quantized)
{
@@ -151,18 +161,16 @@
std::unique_ptr<ITensorInfo> input_qa = input->clone();
std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
- weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+ input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
// Perform validation step on GEMMLowp
return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, gemm_info);
}
else
{
- // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
- const bool skip_bias_in_gemm = run_addition || !skip_im2col;
// Perform validation step on Matrix multiply function
- return CLGEMM::validate(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
+ return CLGEMM::validate(input, weights, biases, output, 1.0f, 1.0f, gemm_info);
}
}
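
A recurring pattern in this release: quantization parameters are no longer read straight off QuantizationInfo but through the new uniform() accessor. A minimal sketch of the migration, assuming `info` is any ITensorInfo pointer:

    // v19.05 style: info->quantization_info().scale / .offset (direct members)
    // v19.08 style: collapse to a per-tensor UniformQuantizationInfo first
    const UniformQuantizationInfo uq = info->quantization_info().uniform();
    const float   scale  = uq.scale;  // per-tensor scale
    const int32_t offset = uq.offset; // per-tensor zero point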
@@ -190,16 +198,18 @@
const unsigned int kernel_width = weights->info()->dimension(idx_width);
const unsigned int kernel_height = weights->info()->dimension(idx_height);
- _is_prepared = weights_info.retain_internal_weights();
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
- _skip_col2im = data_layout == DataLayout::NHWC;
- _append_bias = (biases != nullptr) && (!_is_quantized);
- _is_activationlayer_enabled = act_info.enabled();
- // In case of F16, fused bias will be used in GEMM
- _run_addition = (_skip_im2col) && (_append_bias) && (data_type != DataType::F16);
+ const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+
+ _is_prepared = weights_info.retain_internal_weights();
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ _skip_col2im = data_layout == DataLayout::NHWC;
+
+ // Only for quantized types are there a few cases where the activation function cannot be fused in GEMM
+ _fuse_activation = true;
// Set the GPU target for im2col and col2im
_im2col_kernel.set_target(CLScheduler::get().target());
@@ -208,8 +218,6 @@
const ICLTensor *gemm_input_to_use = input;
ICLTensor *gemm_output_to_use = output;
- const ICLTensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
-
// Get parameters from conv_info
unsigned int stride_x = 0;
unsigned int stride_y = 0;
@@ -227,9 +235,22 @@
unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels) / num_groups;
- // _weights_reshaped will be auto configured in the kernel.
- // Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM
- _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, num_groups);
+ const ICLTensor *biases_to_use = biases;
+ bool append_bias = false;
+
+ if(num_groups != 1 && biases != nullptr)
+ {
+ // num_groups != 1 is only possible for the NCHW layout
+ // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
+ biases_to_use = nullptr;
+ append_bias = true;
+
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, num_groups);
+ }
+ else
+ {
+ _reshape_weights.configure(weights, nullptr, &_weights_reshaped, num_groups);
+ }
// Create tensor to store im2col reshaped inputs
if(!_skip_im2col)
@@ -237,7 +258,7 @@
_memory_group.manage(&_im2col_output);
// Configure and tune im2col. im2col output shape is auto-initialized
- _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation, num_groups);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups);
// Set quantization info
_im2col_output.info()->set_quantization_info(input->info()->quantization_info());
@@ -246,11 +267,6 @@
// Update GEMM input
gemm_input_to_use = &_im2col_output;
}
- else if(_append_bias)
- {
- // Configure add bias kernel
- _add_bias_kernel.configure(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE);
- }
// Create GEMM output tensor
if(!_skip_col2im)
@@ -281,9 +297,9 @@
// Configure output stage for quantized case
if(_is_quantized)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+ const auto output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
- const float multiplier = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+ const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
int output_multiplier = 0;
int output_shift = 0;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
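
The requantization multiplier computed above is below one for typical convolutions, so it can be expressed as a Q0.31 fixed-point multiplier plus a right shift. A self-contained sketch of that decomposition (the library's own calculate_quantized_multiplier_less_than_one may handle edge cases differently):

    #include <cmath>
    #include <cstdint>

    // Decompose m in (0, 1) so that m ~= qm * 2^-31 * 2^-right_shift.
    void decompose_multiplier(double m, int32_t *qm, int *right_shift)
    {
        int exponent = 0;
        const double q = std::frexp(m, &exponent); // m = q * 2^exponent, q in [0.5, 1)
        *right_shift  = -exponent;                 // non-negative since m < 1
        int64_t fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
        if(fixed == (1ll << 31)) // q rounded up to 1.0: renormalize
        {
            fixed /= 2;
            --(*right_shift);
        }
        *qm = static_cast<int32_t>(fixed);
    }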
@@ -296,16 +312,20 @@
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
};
- if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+ if(act_info.enabled())
{
- const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
- const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+ if(supported_acts.count(act_info.activation()) != 0)
+ {
+ const int a_const_int = quantize_qasymm8(act_info.a(), output_quant_info);
+ const int b_const_int = quantize_qasymm8(act_info.b(), output_quant_info);
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
- max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
- // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation
- _is_activationlayer_enabled = false;
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+ max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+ }
+ else
+ {
+ _fuse_activation = false;
+ }
}
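
For reference, the QASYMM8 clamp window derived above, which lets the GEMMLowp output stage stand in for a separate activation run:

    // min/max handed to the output stage (zero_point = output quantization offset):
    //   RELU            -> [zero_point,           255]
    //   BOUNDED_RELU    -> [zero_point,           quantize_qasymm8(a)]
    //   LU_BOUNDED_RELU -> [quantize_qasymm8(b),  quantize_qasymm8(a)]
    // Any other activation sets _fuse_activation = false and falls back to CLActivationLayer.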
// Set the GEMMLowp output stage info
@@ -320,7 +340,7 @@
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth);
+ configure_mm(gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
if(!_skip_im2col)
{
@@ -342,7 +362,7 @@
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
"Output shape does not match the expected one");
- if(_is_activationlayer_enabled)
+ if(!_fuse_activation)
{
_activationlayer_function.configure(output, nullptr, act_info);
}
@@ -379,13 +399,14 @@
const ITensorInfo *gemm_output_to_use = output;
const ITensorInfo *weights_to_use = weights;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool append_bias = (biases != nullptr) && (!is_quantized);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
- const bool skip_col2im = data_layout == DataLayout::NHWC;
- bool is_activationlayer_enabled = act_info.enabled();
- // In case of F16, fused bias will be used in GEMM
- const bool run_addition = (skip_im2col) && (append_bias) && (data_type != DataType::F16);
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ const bool skip_col2im = data_layout == DataLayout::NHWC;
+ bool fuse_activation = true;
+
+ const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -423,10 +444,26 @@
unsigned int mat_weights_cols = weights->dimension(idx_kernels) / num_groups;
- // Output tensor auto inizialitation if not yet initialized
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr, num_groups));
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, (append_bias && !skip_im2col), num_groups), 1, data_type);
- weights_to_use = &weights_reshaped_info;
+ const ITensorInfo *biases_to_use = biases;
+ bool append_bias = false;
+
+ if(num_groups != 1 && biases != nullptr)
+ {
+ // num_groups != 1 is only possible for the NCHW layout
+ // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
+ biases_to_use = nullptr;
+ append_bias = true;
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr, num_groups));
+ weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, num_groups), 1, data_type);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr, num_groups));
+ weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, num_groups), 1, data_type);
+ }
+
+ weights_to_use = &weights_reshaped_info;
if(!skip_im2col)
{
@@ -440,11 +477,6 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
gemm_input_to_use = &im2col_reshaped_info;
}
- else if(run_addition)
- {
- // Validate add bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE));
- }
// Create GEMM output tensor
if(!skip_col2im)
@@ -468,9 +500,9 @@
if(is_quantized)
{
- const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
+ const auto output_quant_info = (output->total_size() == 0) ? iq_info : oq_info;
- const float multiplier = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+ const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
int output_multiplier = 0;
int output_shift = 0;
@@ -484,16 +516,20 @@
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
};
- if(is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+ if(act_info.enabled())
{
- const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
- const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+ if(supported_acts.count(act_info.activation()) != 0)
+ {
+ const int a_const_int = quantize_qasymm8(act_info.a(), output_quant_info);
+ const int b_const_int = quantize_qasymm8(act_info.b(), output_quant_info);
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
- max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
- // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation
- is_activationlayer_enabled = false;
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+ max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+ }
+ else
+ {
+ fuse_activation = false;
+ }
}
// Set the GEMMLowp output stage info
@@ -507,7 +543,7 @@
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, run_addition));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, act_info));
// Validate Col2Im
if(!skip_col2im)
@@ -516,7 +552,7 @@
}
//Validate Activation Layer
- if(is_activationlayer_enabled)
+ if(!fuse_activation)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
}
@@ -548,19 +584,14 @@
_mm_gemm.run();
}
- if(_run_addition)
- {
- CLScheduler::get().enqueue(_add_bias_kernel);
- }
-
// Reshape output matrix
if(!_skip_col2im)
{
CLScheduler::get().enqueue(_col2im_kernel, false);
}
- //Run Activation Layer if enabled
- if(_is_activationlayer_enabled)
+ //Run Activation Layer if we cannot fuse it in GEMM
+ if(!_fuse_activation)
{
_activationlayer_function.run();
}
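
The net effect on the run() path, summarized as a hedged sketch using the member names above:

    // CLGEMMConvolutionLayer::run() after this change, in outline:
    //   if(!_skip_im2col)     enqueue(_im2col_kernel);          // unchanged
    //   GEMM / GEMMLowp       (bias and, where possible, activation now fused)
    //   if(!_skip_col2im)     enqueue(_col2im_kernel);          // unchanged
    //   if(!_fuse_activation) _activationlayer_function.run();  // fallback only
    // The dedicated _add_bias_kernel enqueue is gone entirely.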
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index bcb91e0..36a120e 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -277,11 +277,15 @@
if(_is_quantized)
{
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / _gemmlowp_final.info()->quantization_info().scale;
+ const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = _gemmlowp_final.info()->quantization_info().uniform();
+
+ float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier(0);
int output_shift(0);
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, _gemmlowp_final.info()->quantization_info().offset);
+ _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, oq_info.offset);
_gemmlowp_final.allocator()->allocate();
}
diff --git a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
deleted file mode 100644
index 45547e4..0000000
--- a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
-
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-void CLGEMMInterleave4x4::configure(const ICLTensor *input, ICLTensor *output)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
- k->configure(input, output);
- _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 049db1d..0286cb3 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -48,7 +49,8 @@
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
- _mm_kernel(),
+ _mm_midgard_kernel(),
+ _mm_native_kernel(),
_mm_reshaped_only_rhs_kernel(),
_mtx_b_reshape_kernel(),
_mtx_a_reduction_kernel(),
@@ -63,6 +65,7 @@
_a_offset(0),
_b_offset(0),
_is_gemm_reshaped(true),
+ _is_midgard(false),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
_fuse_output_stage(false)
@@ -77,14 +80,16 @@
_is_prepared = false;
_original_b = b;
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _a_offset = a->info()->quantization_info().offset;
- _b_offset = b->info()->quantization_info().offset;
+ _a_offset = a->info()->quantization_info().uniform().offset;
+ _b_offset = b->info()->quantization_info().uniform().offset;
// Get the GPU target
const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _mm_kernel.set_target(gpu_target);
+ _mm_midgard_kernel.set_target(gpu_target);
+ _mm_native_kernel.set_target(gpu_target);
+ _mm_reshaped_only_rhs_kernel.set_target(gpu_target);
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
@@ -103,6 +108,7 @@
// Check if we need to reshape the matrix A and matrix B
_is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
+ _is_midgard = gpu_target == GPUTarget::MIDGARD;
if(_is_gemm_reshaped)
{
@@ -159,8 +165,19 @@
}
else
{
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ if(_is_midgard)
+ {
+ // Configure matrix multiply kernel
+ _mm_midgard_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Configure matrix multiply kernel
+ _mm_native_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
}
// Configure offset contribution kernel
@@ -178,8 +195,19 @@
}
else
{
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ if(_is_midgard)
+ {
+ // Configure matrix multiply kernel
+ _mm_midgard_kernel.configure(matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Configure matrix multiply kernel
+ _mm_native_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
}
// Configure offset contribution kernel
@@ -213,8 +241,8 @@
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- int32_t a_offset = a->quantization_info().offset;
- int32_t b_offset = b->quantization_info().offset;
+ int32_t a_offset = a->quantization_info().uniform().offset;
+ int32_t b_offset = b->quantization_info().uniform().offset;
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
@@ -232,6 +260,7 @@
const unsigned int k = a->dimension(0);
const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool is_midgard = gpu_target == GPUTarget::MIDGARD;
bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
@@ -287,9 +316,21 @@
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info));
+ if(is_midgard)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_info));
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ }
}
+
// Validate offset contribution kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -308,9 +349,21 @@
}
else
{
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
+ if(is_midgard)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_info));
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ }
}
+
if(output->total_size() != 0)
{
// Validate offset contribution kernel
@@ -353,7 +406,14 @@
}
else
{
- CLScheduler::get().enqueue(_mm_kernel, false);
+ if(_is_midgard)
+ {
+ CLScheduler::get().enqueue(_mm_midgard_kernel, false);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_mm_native_kernel, false);
+ }
}
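
Both new non-reshaped code paths above lean on the same factory call; a minimal sketch of how the native kernel obtains its target-tuned block sizes (GEMMLHSMatrixInfo/GEMMRHSMatrixInfo come from arm_compute Types):

    GEMMLHSMatrixInfo lhs_info;
    GEMMRHSMatrixInfo rhs_info;
    // Ask the per-target factory for lhs/rhs tile shapes, then hand them to
    // CLGEMMLowpMatrixMultiplyNativeKernel::configure(), as in the hunks above.
    std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)
                                   ->configure(m, n, k, batch_size, DataType::QASYMM8);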
// Run matrix A reduction kernel only if _b_offset is not equal to 0
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index f1282cb..020fbbe 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
@@ -72,4 +73,20 @@
{
return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(input, bias, output, min, max);
}
+
+void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ int result_fixedpoint_multiplier, int result_shift,
+ int min, int max)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
+ k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
+ _kernel = std::move(k);
+}
+
+Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+ int min, int max)
+{
+ return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max);
+}
+
} // namespace arm_compute
\ No newline at end of file
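
A hedged usage sketch for the new INT32-to-QSYMM16 output stage (tensor names are illustrative; the multiplier/shift pair comes from a decomposition like the one sketched earlier):

    CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint output_stage;
    // gemm_acc_s32: S32 accumulators, bias_s32: optional S32 bias, dst: QSYMM16
    output_stage.configure(&gemm_acc_s32, &bias_s32, &dst,
                           result_fixedpoint_multiplier, result_shift,
                           -32768, 32767); // clamp to the full QSYMM16 range
    output_stage.run();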
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 136cb5e..e76e4f6 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -34,25 +34,31 @@
namespace arm_compute
{
+namespace
+{
+constexpr int max_input_tensor_dim = 3;
+} // namespace
+
CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
{
}
-void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
+void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon)
{
// Manage intermediate buffers
_memory_group.manage(&_sumsq);
// Configure kernels
- _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+ const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
+ _reduce_func.configure(input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE);
_normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
// Allocate intermediate tensor
_sumsq.allocator()->allocate();
}
-Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, float epsilon)
+Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int axis, float epsilon)
{
TensorShape shape(input->tensor_shape());
@@ -61,10 +67,11 @@
sum_sq.set_data_type(input->data_type());
sum_sq.set_tensor_shape(shape);
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
+ const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
// Reduce shape on axis
- shape.set(axis, 1);
+ shape.set(actual_axis, 1);
sum_sq.set_tensor_shape(shape);
ARM_COMPUTE_RETURN_ON_ERROR(CLL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
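
The wrap_around call gives the layer Python-style negative-axis indexing over the three supported dimensions; an equivalent self-contained sketch:

    // Equivalent of wrap_around(axis, max_input_tensor_dim) with max_input_tensor_dim = 3:
    inline unsigned int wrap_axis(int axis)
    {
        int wrapped = axis % 3;
        if(wrapped < 0)
        {
            wrapped += 3; // e.g. wrap_axis(-1) == 2, wrap_axis(0) == 0
        }
        return static_cast<unsigned int>(wrapped);
    }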
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 4606a66..793d5ca 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -38,16 +38,18 @@
using namespace arm_compute::misc::shape_calculator;
CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _gemm_input_gate(), _transpose_input_gate(), _accum_input_gate1(), _accum_input_gate2(), _subtract_input_gate(),
- _pixelwise_mul_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _gemm_forget_gate(), _transpose_forget_gate(), _accum_forget_gate1(), _accum_forget_gate2(),
- _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
- _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
- _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
- _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
- _ones_memset_kernel(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(),
+ : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
+ _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(),
+ _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(),
+ _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(),
+ _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
+ _ones_memset_kernel(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(),
+ _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(),
+ _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(),
_forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(),
- _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false),
- _is_prepared(false)
+ _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(),
+ _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false),
+ _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false)
{
}
@@ -66,6 +68,8 @@
output_state_in, cell_state_in,
scratch_buffer, output_state_out, cell_state_out, output);
+ _is_layer_norm_lstm = lstm_params.use_layer_norm();
+
// Set lstm parameters
LSTMParams<ITensorInfo> lstm_params_info;
if(lstm_params.has_peephole_opt())
@@ -121,7 +125,7 @@
_concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
_memory_group.manage(&_forget_gate_out5);
- _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
+ _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
_memory_group.manage(&_forget_gate_out1);
_memory_group.manage(&_forget_gate_out3);
_forget_gate_out6.allocator()->allocate();
@@ -134,7 +138,7 @@
_run_peephole_opt = true;
_memory_group.manage(&_forget_gate_out4);
_pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_forget_gate2.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
_forget_gate_out4.allocator()->allocate();
_forget_gate_out5.allocator()->allocate();
forget_gate_out = &_forget_gate_out3;
@@ -143,6 +147,20 @@
{
_forget_gate_out3.allocator()->allocate();
}
+ if(_is_layer_norm_lstm)
+ {
+ _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_forget_layer_norm_out1);
+ _memory_group.manage(&_forget_layer_norm_out2);
+ _mean_std_norm_forget_gate.configure(forget_gate_out);
+ _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ // forget_gate_out will be reassigned below, so allocate the tensor it currently points to
+ forget_gate_out->allocator()->allocate();
+ _accum_forget_gate_bias.configure(ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+ _forget_layer_norm_out1.allocator()->allocate();
+ forget_gate_out = &_forget_layer_norm_out2;
+ }
_activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
@@ -177,7 +195,7 @@
_memory_group.manage(&_input_gate_out1);
_memory_group.manage(&_input_gate_out3);
- _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3);
+ _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
_input_gate_out2.allocator()->allocate();
input_gate_out = &_input_gate_out3;
@@ -185,7 +203,7 @@
{
_memory_group.manage(&_input_gate_out4);
_pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
_input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
input_gate_out = &_input_gate_out1;
@@ -194,6 +212,21 @@
{
_input_gate_out1.allocator()->allocate();
}
+
+ if(_is_layer_norm_lstm)
+ {
+ _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_input_layer_norm_out1);
+ _memory_group.manage(&_input_layer_norm_out2);
+ _mean_std_norm_input_gate.configure(input_gate_out);
+ _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ // input_gate_out will be reassigned below, so allocate the tensor it currently points to
+ input_gate_out->allocator()->allocate();
+ _accum_input_gate_bias.configure(ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+ _input_layer_norm_out1.allocator()->allocate();
+ input_gate_out = &_input_layer_norm_out2;
+ }
_activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
@@ -207,7 +240,7 @@
_cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_state_out1);
- _fully_connected_cell_state.configure(input, input_to_cell_weights, cell_bias, &_cell_state_out1);
+ _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
_memory_group.manage(&_cell_state_out2);
_transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
_memory_group.manage(&_cell_state_out3);
@@ -215,10 +248,25 @@
_cell_state_out2.allocator()->allocate();
_memory_group.manage(&_cell_state_out4);
_accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
- _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
+ CLTensor *cell_state_out_ptr = &_cell_state_out4;
+ if(_is_layer_norm_lstm)
+ {
+ _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_cell_layer_norm_out1);
+ _memory_group.manage(&_cell_layer_norm_out2);
+ _mean_std_norm_cell_gate.configure(cell_state_out_ptr);
+ _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ // cell_state_out_ptr will be reassigned below, so allocate the tensor it currently points to
+ cell_state_out_ptr->allocator()->allocate();
+ _accum_cell_gate_bias.configure(ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+ _cell_layer_norm_out1.allocator()->allocate();
+ cell_state_out_ptr = &_cell_layer_norm_out2;
+ }
+ _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _cell_state_out4.allocator()->allocate();
+ _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ cell_state_out_ptr->allocator()->allocate();
_pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
@@ -247,7 +295,7 @@
_memory_group.manage(&_output1);
_memory_group.manage(&_output4);
- _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4);
+ _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
_output2.allocator()->allocate();
_forget_gate_out2.allocator()->allocate();
@@ -259,7 +307,7 @@
_memory_group.manage(&_output3);
_pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+ _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
_output4.allocator()->allocate();
output_gate_out = &_output1;
@@ -270,6 +318,20 @@
{
_output1.allocator()->allocate();
}
+ if(_is_layer_norm_lstm)
+ {
+ _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_output_layer_norm_out1);
+ _memory_group.manage(&_output_layer_norm_out2);
+ _mean_std_norm_output_gate.configure(output_gate_out);
+ _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ // output_gate_out will be reassigned below, so allocate the tensor it currently points to
+ output_gate_out->allocator()->allocate();
+ _accum_output_gate_bias.configure(ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+ _output_layer_norm_out1.allocator()->allocate();
+ output_gate_out = &_output_layer_norm_out2;
+ }
_activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the output state
@@ -316,7 +378,7 @@
scratch_inputs.emplace_back(&_cell_state_out1);
scratch_inputs.emplace_back(forget_gate_out);
scratch_inputs.emplace_back(output_gate_out);
- _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+ _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer, Window::DimX);
input_gate_out->allocator()->allocate();
_cell_state_out1.allocator()->allocate();
forget_gate_out->allocator()->allocate();
@@ -370,6 +432,31 @@
const unsigned int num_batches = input->dimension(1);
const unsigned int num_cells = input_to_output_weights->dimension(1);
+ if(lstm_params.use_layer_norm())
+ {
+ // If CIFG is used, the input layer normalization weights tensor is omitted
+ if(lstm_params.has_cifg_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights()->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights()->dimension(0) != num_batches);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->dimension(0) != num_batches);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->dimension(0) != num_batches);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->dimension(0) != num_batches);
+ }
+
// Check peephole optimization
if(lstm_params.has_peephole_opt())
{
@@ -389,7 +476,7 @@
TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
// Validate forget gate
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
std::vector<const ITensorInfo *> inputs_vector;
inputs_vector.emplace_back(input);
@@ -404,6 +491,13 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
+ if(lstm_params.use_layer_norm())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+ }
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate input gate
@@ -423,7 +517,7 @@
TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat));
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
if(lstm_params.has_peephole_opt())
{
@@ -432,6 +526,13 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
}
+
+ if(lstm_params.use_layer_norm())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+ }
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
}
else
@@ -440,9 +541,16 @@
}
// Validate cell state
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, cell_bias, &cell_state_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if(lstm_params.use_layer_norm())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+ }
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -460,7 +568,7 @@
TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat));
// Validate output gate tmp
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
if(lstm_params.has_peephole_opt())
{
@@ -468,6 +576,13 @@
RoundingPolicy::TO_NEAREST_EVEN));
ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
}
+ if(lstm_params.use_layer_norm())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+ }
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate output state
@@ -497,7 +612,7 @@
inputs_vector_info_raw.push_back(&forget_gate);
inputs_vector_info_raw.push_back(&output_gate_tmp);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer, Window::DimX));
return Status{};
}
@@ -514,7 +629,13 @@
if(_run_peephole_opt)
{
CLScheduler::get().enqueue(_pixelwise_mul_forget_gate);
- _accum_forget_gate2.run();
+ _accum_forget_gate1.run();
+ }
+ if(_is_layer_norm_lstm)
+ {
+ _mean_std_norm_forget_gate.run();
+ CLScheduler::get().enqueue(_pixelwise_mul_forget_gate_coeff);
+ CLScheduler::get().enqueue(_accum_forget_gate_bias);
}
CLScheduler::get().enqueue(_activation_forget_gate);
@@ -530,7 +651,14 @@
if(_run_peephole_opt)
{
CLScheduler::get().enqueue(_pixelwise_mul_input_gate);
- _accum_input_gate2.run();
+ _accum_input_gate1.run();
+ }
+
+ if(_is_layer_norm_lstm)
+ {
+ _mean_std_norm_input_gate.run();
+ CLScheduler::get().enqueue(_pixelwise_mul_input_gate_coeff);
+ CLScheduler::get().enqueue(_accum_input_gate_bias);
}
CLScheduler::get().enqueue(_activation_input_gate);
}
@@ -539,6 +667,12 @@
CLScheduler::get().enqueue(_transpose_cell_state);
_gemm_cell_state1.run();
CLScheduler::get().enqueue(_accum_cell_state1);
+ if(_is_layer_norm_lstm)
+ {
+ _mean_std_norm_cell_gate.run();
+ CLScheduler::get().enqueue(_pixelwise_mul_cell_gate_coeff);
+ CLScheduler::get().enqueue(_accum_cell_gate_bias);
+ }
CLScheduler::get().enqueue(_activation_cell_state);
CLScheduler::get().enqueue(_pixelwise_mul_cell_state1);
CLScheduler::get().enqueue(_pixelwise_mul_cell_state2);
@@ -554,7 +688,13 @@
if(_run_peephole_opt)
{
CLScheduler::get().enqueue(_pixelwise_mul_output_state1);
- _accum_output2.run();
+ _accum_output1.run();
+ }
+ if(_is_layer_norm_lstm)
+ {
+ _mean_std_norm_output_gate.run();
+ CLScheduler::get().enqueue(_pixelwise_mul_output_gate_coeff);
+ CLScheduler::get().enqueue(_accum_output_gate_bias);
}
CLScheduler::get().enqueue(_activation_output);
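
Every gate touched above follows the same layer-normalisation pipeline; a compact sketch of the per-gate math (symbols, not API calls):

    // For each gate g in {input, forget, cell, output} when lstm_params.use_layer_norm():
    //   z      = [W_x | W_h] * [x ; h_prev]           // fully connected, bias passed as nullptr
    //   z_norm = (z - mean(z)) / stddev(z)            // CLMeanStdDevNormalizationLayer
    //   z_hat  = z_norm * layer_norm_weights + bias   // pixel-wise mul, saturating add
    //   g      = logistic(z_hat)                      // the cell update uses the user-supplied activation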
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
new file mode 100644
index 0000000..11cf85e
--- /dev/null
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -0,0 +1,556 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace
+{
+// Quantization info structures used in the quantized LSTM layer
+const QuantizationInfo qasymm(1.f / 128.f, 128);
+const QuantizationInfo qsymm_3(8.f / 32768.f, 0);  // qsymm16 with 3 integer bits
+const QuantizationInfo qsymm_4(16.f / 32768.f, 0); // qsymm16 with 4 integer bits
+const QuantizationInfo qsymm_0(1.f / 32768.f, 0);  // qsymm16 with 0 integer bits
+} // namespace
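
The constants above follow directly from the QSYMM16 format: with n integer bits the int16 code range covers [-2^n, 2^n), giving scale 2^n / 32768.

    // scale(n) = 2^n / 2^15:
    //   qsymm_0: 1/32768  -> representable range [-1, 1)
    //   qsymm_3: 8/32768  -> representable range [-8, 8)
    //   qsymm_4: 16/32768 -> representable range [-16, 16)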
+
+CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
+ _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(),
+ _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(),
+ _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr),
+ _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr),
+ _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(),
+ _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(),
+ _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false)
+{
+}
+
+void CLLSTMLayerQuantized::configure(const ICLTensor *input,
+ const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out, ICLTensor *output_state_out)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+
+ ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(),
+ recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+
+ const int input_size = input->info()->dimension(0);
+ const int batch_size = input->info()->dimension(1);
+ const int output_size = input_to_input_weights->info()->dimension(1);
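+ // Each input-to-gate weight matrix has shape [input_size, output_size], so dimension 1 gives the output size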
+
+ const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
+
+ auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_4));
+ auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QASYMM8, qasymm));
+
+ _input_to_input_weights = input_to_input_weights;
+ _input_to_forget_weights = input_to_forget_weights;
+ _input_to_cell_weights = input_to_cell_weights;
+ _input_to_output_weights = input_to_output_weights;
+ _recurrent_to_input_weights = recurrent_to_input_weights;
+ _recurrent_to_forget_weights = recurrent_to_forget_weights;
+ _recurrent_to_cell_weights = recurrent_to_cell_weights;
+ _recurrent_to_output_weights = recurrent_to_output_weights;
+ _input_gate_bias = input_gate_bias;
+ _forget_gate_bias = forget_gate_bias;
+ _cell_bias = cell_bias;
+ _output_gate_bias = output_gate_bias;
+
+ // Weights concatenation
+ std::vector<const ICLTensor *> inputs_weights_vector;
+ inputs_weights_vector.emplace_back(input_to_input_weights);
+ inputs_weights_vector.emplace_back(input_to_forget_weights);
+ inputs_weights_vector.emplace_back(input_to_cell_weights);
+ inputs_weights_vector.emplace_back(input_to_output_weights);
+
+ std::vector<const ICLTensor *> recurrent_weights_vector;
+ recurrent_weights_vector.emplace_back(recurrent_to_input_weights);
+ recurrent_weights_vector.emplace_back(recurrent_to_forget_weights);
+ recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
+ recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
+
+ _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY);
+
+ _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY);
+
+ std::vector<const ICLTensor *> weights_vector;
+ weights_vector.emplace_back(&_recurrent_weights);
+ weights_vector.emplace_back(&_input_weights);
+
+ _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _concat_weights.configure(weights_vector, &_weights, Window::DimX);
+ _transpose_weights.configure(&_weights, &_weights_transposed);
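+ // Packing all gate weights into a single matrix lets one gemmlowp call produce the four gate pre-activations at once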
+
+ // Input concatenation
+ std::vector<const ICLTensor *> input_vector;
+ input_vector.emplace_back(input);
+ input_vector.emplace_back(output_state_in);
+
+ _memory_group.manage(&_input);
+ _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+ _concat_inputs.configure(input_vector, &_input, Window::DimX);
+
+ // Bias concatenation
+ std::vector<const ICLTensor *> bias_vector;
+ bias_vector.emplace_back(input_gate_bias);
+ bias_vector.emplace_back(forget_gate_bias);
+ bias_vector.emplace_back(cell_bias);
+ bias_vector.emplace_back(output_gate_bias);
+
+ _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32));
+ _concat_bias.configure(bias_vector, &_bias, Window::DimX);
+
+ // Invert the offset for gemmlowp
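+ // (the gemmlowp core uses the opposite sign convention for zero-points; the original values are restored just below)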
+ _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+
+ // Run gemmlowp
+ _memory_group.manage(&_output_highp);
+ _output_highp.allocator()->init(TensorInfo(TensorShape(4 * output_size, batch_size), 1, DataType::S32));
+ _gemmlowp.configure(&_input, &_weights_transposed, nullptr, &_output_highp);
+ _input.allocator()->allocate();
+
+ // Set the offset back
+ _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+
+ // multiplier = (input_scale * weights_scale) / output_scale, where output_scale = 2^-12 (qsymm_3)
+ _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
+
+ const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+
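+ // Decompose the real multiplier into a normalized fixed-point multiplier and a right shift (only valid for multipliers below one)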
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+ _memory_group.manage(&_output_lowp);
+ _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
+ _output_highp.allocator()->allocate();
+ _bias.allocator()->allocate();
+
+ // Get the gate tensors
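+ // The gemmlowp output packs the input, forget, cell-modulation and output gates along the X axis; slices are 2D when batched, 1D otherwise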
+ if(batch_size > 1)
+ {
+ _memory_group.manage(&_input_gate_input);
+ _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+ _memory_group.manage(&_forget_gate_input);
+ _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+ _memory_group.manage(&_input_modulation_gate_input);
+ _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+ _memory_group.manage(&_output_gate_input);
+ _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+ _output_lowp.allocator()->allocate();
+ }
+ else
+ {
+ _memory_group.manage(&_input_gate_input);
+ _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size });
+ _memory_group.manage(&_forget_gate_input);
+ _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+ _memory_group.manage(&_input_modulation_gate_input);
+ _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+ _memory_group.manage(&_output_gate_input);
+ _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+ _output_lowp.allocator()->allocate();
+ }
+
+ // Forget gate
+ _memory_group.manage(&_forget_gate_output);
+ _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_input.allocator()->allocate();
+
+ // Input gate
+ _memory_group.manage(&_input_gate_output);
+ _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_input.allocator()->allocate();
+
+ // Input modulation gate equation
+ _memory_group.manage(&_input_modulation_gate_output);
+ _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _input_modulation_gate_input.allocator()->allocate();
+
+ // Output gate
+ _memory_group.manage(&_output_gate_output);
+ _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_input.allocator()->allocate();
+
+ // Long term memory
+ _memory_group.manage(&_cell_state_tmp1);
+ _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul_forget_gate_cell_state.configure(&_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _forget_gate_output.allocator()->allocate();
+
+ _memory_group.manage(&_cell_state_tmp2);
+ _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul_input_gate_input_mod_gate.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _input_modulation_gate_output.allocator()->allocate();
+ _input_gate_output.allocator()->allocate();
+
+ _add_cell_state_tmps.configure(&_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE);
+ _cell_state_tmp1.allocator()->allocate();
+ _cell_state_tmp2.allocator()->allocate();
+
+ // Short term memory
+ _memory_group.manage(&_output_state_tmp);
+ _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+
+ _memory_group.manage(&_output_state_out_symm);
+ _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _mul_output_state_tmp_output_gate.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _output_gate_output.allocator()->allocate();
+ _output_state_tmp.allocator()->allocate();
+
+ // Requantize the output state from QSYMM16 to QASYMM8
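+ // (done as a dequantize -> F32 -> quantize round trip)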
+ _memory_group.manage(&_output_state_out_f32);
+ _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+ _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32);
+ _output_state_out_symm.allocator()->allocate();
+
+ _quantize.configure(&_output_state_out_f32, output_state_out);
+ _output_state_out_f32.allocator()->allocate();
+}
+
+Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
+ output_state_in, cell_state_out, output_state_out);
+
+ const int input_size = input->dimension(0);
+ const int batch_size = input->dimension(1);
+ const int output_size = input_to_input_weights->dimension(1);
+
+ // Dimensionality checks
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_input_weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
+
+ TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
+ TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
+ TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+ TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
+ TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
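+ // Reference descriptors that the gate weights, biases and state tensors below must match in shape, data type and quantization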
+
+ // Shape checks
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
+
+ // Data type checks
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
+
+ // Quantization checks
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
+
+ // Validate internal functions
+ // _concat_input_weights
+ std::vector<const ITensorInfo *> inputs_weights_vector;
+ inputs_weights_vector.emplace_back(input_to_input_weights);
+ inputs_weights_vector.emplace_back(input_to_forget_weights);
+ inputs_weights_vector.emplace_back(input_to_cell_weights);
+ inputs_weights_vector.emplace_back(input_to_output_weights);
+ const QuantizationInfo qweights = input_to_input_weights->quantization_info(); // Weights quantization
+ const TensorInfo input_weights(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_weights_vector, &input_weights, Window::DimY));
+
+ // _concat_recurrent_weights
+ std::vector<const ITensorInfo *> recurrent_weights_vector;
+ recurrent_weights_vector.emplace_back(recurrent_to_input_weights);
+ recurrent_weights_vector.emplace_back(recurrent_to_forget_weights);
+ recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
+ recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
+ const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+
+ // _concat_weights
+ std::vector<const ITensorInfo *> weights_vector;
+ weights_vector.emplace_back(&recurrent_weights);
+ weights_vector.emplace_back(&input_weights);
+ const TensorInfo weights(TensorShape(input_size + output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
+ // _transpose_weights
+ const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
+ TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed));
+
+ // _concat_inputs
+ std::vector<const ITensorInfo *> input_vector;
+ input_vector.emplace_back(input);
+ input_vector.emplace_back(output_state_in);
+ TensorInfo input_concatenated(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(input_vector, &input_concatenated, Window::DimX));
+
+ // _concat_bias
+ std::vector<const ITensorInfo *> bias_vector;
+ bias_vector.emplace_back(input_gate_bias);
+ bias_vector.emplace_back(forget_gate_bias);
+ bias_vector.emplace_back(cell_bias);
+ bias_vector.emplace_back(output_gate_bias);
+
+ const TensorInfo bias_concatenated(TensorShape(4 * output_size), 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(bias_vector, &bias_concatenated, Window::DimX));
+
+ // Invert the offset for gemmlowp
+ input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
+ weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+
+ // _gemmlowp
+ const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+
+ // Set the offset back
+ input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
+ weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+
+ // multiplier = (input_scale * weights_scale) / output_scale, where output_scale = 2^-12 (qsymm_3)
+ const TensorInfo output_lowp(output_highp.tensor_shape(), 1, DataType::QSYMM16, qsymm_3);
+
+ const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
+ ARM_COMPUTE_UNUSED(multiplier);
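+ // The fixed-point output stage only supports effective multipliers below one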
+ ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
+ // _output_stage
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp));
+
+ TensorInfo input_gate_input;
+ TensorInfo forget_gate_input;
+ TensorInfo input_modulation_gate_input;
+ TensorInfo output_gate_input;
+
+ if(batch_size > 1)
+ {
+ // _slice_input_tensor
+ input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+ // _slice_forget_tensor
+ forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+ // _slice_cell_tensor
+ input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+ // _slice_output_tensor
+ output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+ }
+ else
+ {
+ // _slice_input_tensor
+ input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+ // _slice_forget_tensor
+ forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+ // _slice_cell_tensor
+ input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+ // _slice_output_tensor
+ output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+ }
+
+ // _sigmoid_forget_gate
+ const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ // _sigmoid_input_gate
+ const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ // _tanh_modulation_gate
+ const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ // _sigmoid_output_gate
+ const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+ // _mul_forget_gate_cell_state
+ const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+ // _mul_input_gate_input_mod_gate
+ const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+ // _add_cell_state_tmps
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+
+ // _tanh_output_state
+ const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+
+ // _mul_output_state_tmp_output_gate
+ const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+ // _dequantize
+ const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&output_state_out_symm, &output_state_out_f32));
+
+ // _quantize
+ ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out));
+
+ if(cell_state_out->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
+ }
+
+ if(output_state_out->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_out);
+ }
+
+ return Status{};
+}
+
+void CLLSTMLayerQuantized::run()
+{
+ prepare();
+
+ // Acquire all the temporaries
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Concatenate the input with the previous output state
+ _concat_inputs.run();
+
+ // Run gemmlowp
+ _gemmlowp.run();
+ _output_stage.run();
+
+ // Slice the results
+ _slice_input_tensor.run();
+ _slice_forget_tensor.run();
+ _slice_cell_tensor.run();
+ _slice_output_tensor.run();
+
+ // Gates
+ // Forget gate
+ _sigmoid_forget_gate.run();
+
+ // Input gate
+ _sigmoid_input_gate.run();
+
+ // Input modulation gate
+ _tanh_modulation_gate.run();
+
+ // Output gate
+ _sigmoid_output_gate.run();
+
+ // Cell state (long term memory)
+ _mul_forget_gate_cell_state.run();
+ _mul_input_gate_input_mod_gate.run();
+ _add_cell_state_tmps.run();
+
+ // Output state (short term memory)
+ _tanh_output_state.run();
+ _mul_output_state_tmp_output_gate.run();
+
+ // Requantize the output state from QSYMM16 to QASYMM8
+ _dequantize.run();
+ _quantize.run();
+}
+
+void CLLSTMLayerQuantized::prepare()
+{
+ if(!_is_prepared)
+ {
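+ // Pack the weights and biases once: concatenate, transpose, then free every intermediate and mark the originals as unused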
+ _input_weights.allocator()->allocate();
+ _concat_input_weights.run();
+
+ _input_to_input_weights->mark_as_unused();
+ _input_to_forget_weights->mark_as_unused();
+ _input_to_cell_weights->mark_as_unused();
+ _input_to_output_weights->mark_as_unused();
+
+ _recurrent_weights.allocator()->allocate();
+ _concat_recurrent_weights.run();
+ _recurrent_to_input_weights->mark_as_unused();
+ _recurrent_to_forget_weights->mark_as_unused();
+ _recurrent_to_cell_weights->mark_as_unused();
+ _recurrent_to_output_weights->mark_as_unused();
+
+ _weights.allocator()->allocate();
+ _concat_weights.run();
+
+ _input_weights.mark_as_unused();
+ _input_weights.allocator()->free();
+ _recurrent_weights.mark_as_unused();
+ _recurrent_weights.allocator()->free();
+
+ _weights_transposed.allocator()->allocate();
+ _transpose_weights.run();
+
+ _weights.mark_as_unused();
+ _weights.allocator()->free();
+
+ _bias.allocator()->allocate();
+ _concat_bias.run();
+ _input_gate_bias->mark_as_unused();
+ _forget_gate_bias->mark_as_unused();
+ _cell_bias->mark_as_unused();
+ _output_gate_bias->mark_as_unused();
+
+ _is_prepared = true;
+ }
+}
+
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
similarity index 64%
rename from src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
rename to src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index d054e01..802a2fc 100644
--- a/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,18 +21,23 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h"
+#include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
#include "arm_compute/core/Types.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
-void CLGEMMTranspose1xW::configure(const ICLTensor *input, ICLTensor *output)
+namespace arm_compute
{
- auto k = arm_compute::support::cpp14::make_unique<CLGEMMTranspose1xWKernel>();
- k->configure(input, output);
+void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float epsilon)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLMeanStdDevNormalizationKernel>();
+ k->configure(input, output, epsilon);
_kernel = std::move(k);
-}
\ No newline at end of file
+}
+
+Status CLMeanStdDevNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
+{
+ return CLMeanStdDevNormalizationKernel::validate(input, output, epsilon);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
new file mode 100644
index 0000000..d463ef9
--- /dev/null
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+{
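+ // If one input is broadcast along X (width 1), replicate its border so the kernel can read full vector lanes safely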
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+ }
+ }
+}
+} // namespace
+
+void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::PRELU, input, alpha, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input, alpha, output);
+}
+
+Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index cbe1ce3..086017a 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@
PixelValue pixel_value(0.f);
if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
{
- pixel_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ pixel_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
}
switch(input->info()->data_layout())
{
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 9f99d2d..38f0a75 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -105,6 +105,16 @@
intermediate_kernel_op = ReductionOperation::PROD;
last_kernel_op = ReductionOperation::PROD;
break;
+ case ReductionOperation::MIN:
+ first_kernel_op = ReductionOperation::MIN;
+ intermediate_kernel_op = ReductionOperation::MIN;
+ last_kernel_op = ReductionOperation::MIN;
+ break;
+ case ReductionOperation::MAX:
+ first_kernel_op = ReductionOperation::MAX;
+ intermediate_kernel_op = ReductionOperation::MAX;
+ last_kernel_op = ReductionOperation::MAX;
+ break;
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -179,6 +189,60 @@
last_kernel_op = ReductionOperation::PROD;
pixelValue = PixelValue(1, input->info()->data_type());
break;
+ case ReductionOperation::MIN:
+ first_kernel_op = ReductionOperation::MIN;
+ intermediate_kernel_op = ReductionOperation::MIN;
+ last_kernel_op = ReductionOperation::MIN;
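+ // Fill border elements with the identity for MIN: the largest representable value of the data type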
+ switch(input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ pixelValue = PixelValue(std::numeric_limits<float>::max());
+ break;
+ }
+ case DataType::F16:
+ {
+ pixelValue = PixelValue(static_cast<half>(65504.0f));
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ pixelValue = PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported DataType");
+ }
+ }
+ break;
+ case ReductionOperation::MAX:
+ first_kernel_op = ReductionOperation::MAX;
+ intermediate_kernel_op = ReductionOperation::MAX;
+ last_kernel_op = ReductionOperation::MAX;
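+ // Fill border elements with the identity for MAX: the lowest representable value of the data type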
+ switch(input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ pixelValue = PixelValue(-std::numeric_limits<float>::max());
+ break;
+ }
+ case DataType::F16:
+ {
+ pixelValue = PixelValue(static_cast<half>(-65504.0f));
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ pixelValue = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported DataType");
+ }
+ }
+ break;
default:
ARM_COMPUTE_ERROR("Not supported");
}
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index a24b72e..fa6e82e 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,7 +44,7 @@
if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
- _memset_kernel.configure(output, PixelValue());
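+ // The padded area must hold the quantized representation of zero, not a raw 0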
+ _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
}
_space_to_batch_kernel.configure(input, block_shape, paddings, output);
}
@@ -56,14 +56,14 @@
if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
- _memset_kernel.configure(output, PixelValue());
+ _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
}
_space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
}
Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
return Status{};
@@ -72,7 +72,7 @@
Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
new file mode 100644
index 0000000..f02a13f
--- /dev/null
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLSpaceToDepthLayer::CLSpaceToDepthLayer()
+ : _space_to_depth_kernel()
+{
+}
+
+void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+{
+ _space_to_depth_kernel.configure(input, output, block_shape);
+}
+
+Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+{
+ return CLSpaceToDepthLayerKernel::validate(input, output, block_shape);
+}
+
+void CLSpaceToDepthLayer::run()
+{
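+ // Enqueue the kernel and flush the CL command queue (second argument)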
+ CLScheduler::get().enqueue(_space_to_depth_kernel, true);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
deleted file mode 100644
index a8667c3..0000000
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
- : _concat_kernels_vector(),
- _concat_x2_kernel(),
- _concat_x4_kernel(),
- _num_inputs(0)
-{
-}
-
-Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
-{
- const unsigned int num_inputs = inputs_vector.size();
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
-
- // Output auto inizialitation if not yet initialized
- TensorInfo tmp_output_info = *output->clone();
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
- auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
-
- switch(num_inputs)
- {
- case 2:
- // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], &tmp_output_info));
- break;
- case 4:
- // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], &tmp_output_info));
- break;
- default:
- unsigned int width_offset = 0;
- // Validate generic case of WidthConcatenate kernel
- for(const auto &input : inputs_vector)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
- width_offset += input->dimension(0);
- }
- break;
- }
-
- return Status{};
-}
-
-void CLWidthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
-{
- _num_inputs = inputs_vector.size();
-
- std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < _num_inputs; i++)
- {
- inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
- }
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
-
- ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
-
- switch(_num_inputs)
- {
- case 2:
- // Configure WidthConcatenate2Tensors kernel
- _concat_x2_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), output);
- break;
- case 4:
- // Configure WidthConcatenate4Tensors kernel
- _concat_x4_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
- break;
- default:
- // Configure generic case WidthConcatenate kernels
- _concat_kernels_vector.resize(_num_inputs);
-
- unsigned int width_offset = 0;
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
- width_offset += inputs_vector.at(i)->info()->dimension(0);
- }
- break;
- }
-}
-
-void CLWidthConcatenateLayer::run()
-{
- cl::CommandQueue q = CLScheduler::get().queue();
-
- switch(_num_inputs)
- {
- case 2:
- CLScheduler::get().enqueue(_concat_x2_kernel, true);
- break;
- case 4:
- CLScheduler::get().enqueue(_concat_x4_kernel, true);
- break;
- default:
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
- }
- break;
- }
-}
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index d3c3f98..a5db977 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -158,6 +158,9 @@
const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->data_layout());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size");
+
// Check if the Winograd configuration requires fast math
if(!enable_fast_math)
{
@@ -189,13 +192,7 @@
GEMMLowpOutputStageInfo(), (input->data_type() == DataType::F16))));
// Configure output transform
- ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info));
-
- // Validate Activation Layer
- if(act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info, act_info));
return Status{};
}