arm_compute v19.02
Change-Id: I853a3ecf38f206da13c1b03640c8adf73c20477c
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
new file mode 100644
index 0000000..a6393c5
--- /dev/null
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel>();
+ k->configure(input, output, axis, op);
+ _kernel = std::move(k);
+}
+
+Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+ return CLReductionOperationKernel::validate(input, output, axis, op);
+}
+} // namespace arm_compute
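A minimal usage sketch for the new function (not part of this patch; the U32 index type and the reduced-axis-collapsed-to-1 output shape are assumptions based on CLReductionOperationKernel's argmin/argmax behaviour):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init(); // create CL context and queue

        CLTensor input, indices;
        input.allocator()->init(TensorInfo(TensorShape(128U, 24U), 1, DataType::F32));
        // Reducing axis 0 collapses it to 1; indices assumed to be U32
        indices.allocator()->init(TensorInfo(TensorShape(1U, 24U), 1, DataType::U32));

        CLArgMinMaxLayer argmax;
        argmax.configure(&input, 0 /* axis */, &indices, ReductionOperation::ARG_IDX_MAX);

        input.allocator()->allocate();
        indices.allocator()->allocate();
        // ... map and fill input ...
        argmax.run();
        CLScheduler::get().sync();
        return 0;
    }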
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
deleted file mode 100644
index 0b05058..0000000
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
- k->configure(input1, input2, output, policy);
- _kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
-
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
- return CLArithmeticAdditionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticDivision.cpp b/src/runtime/CL/functions/CLArithmeticDivision.cpp
deleted file mode 100644
index 1c2849c..0000000
--- a/src/runtime/CL/functions/CLArithmeticDivision.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticDivision.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLArithmeticDivisionKernel>();
- k->configure(input1, input2, output);
- _kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
-
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
- return CLArithmeticDivisionKernel::validate(input1, input2, output);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
deleted file mode 100644
index e661f6a..0000000
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
- k->configure(input1, input2, output, policy);
- _kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
-
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
- return CLArithmeticSubtractionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
new file mode 100644
index 0000000..e0ffcdb
--- /dev/null
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
+ k->configure(input, output, policy, 0);
+ _kernel = std::move(k);
+}
+
+Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy)
+{
+ return CLDepthConvertLayerKernel::validate(input, output, policy, 0);
+}
+} // namespace arm_compute
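CLCast is a thin wrapper that forwards to CLDepthConvertLayerKernel with a fixed shift of 0. A usage sketch, reusing the setup boilerplate from the CLArgMinMaxLayer example above (the U8-to-F32 conversion is an assumption about the kernel's supported type pairs):

    CLTensor u8_src, f32_dst;
    u8_src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    f32_dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

    CLCast cast;
    cast.configure(&u8_src, &f32_dst, ConvertPolicy::SATURATE);

    u8_src.allocator()->allocate();
    f32_dst.allocator()->allocate();
    cast.run();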
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
new file mode 100644
index 0000000..86c9c31
--- /dev/null
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLComparison.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+ k->configure(input1, input2, output, operation);
+ _kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+{
+ return CLComparisonKernel::validate(input1, input2, output, operation);
+}
+
+template <ComparisonOperation COP>
+void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+ k->configure(input1, input2, output, COP);
+ _kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+template <ComparisonOperation COP>
+Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLComparisonKernel::validate(input1, input2, output, COP);
+}
+
+// Supported Specializations
+template class CLComparisonStatic<ComparisonOperation::Equal>;
+template class CLComparisonStatic<ComparisonOperation::NotEqual>;
+template class CLComparisonStatic<ComparisonOperation::Greater>;
+template class CLComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class CLComparisonStatic<ComparisonOperation::Less>;
+template class CLComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
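The comparison functions write a U8 mask (0 for false, 255 for true) and support x-dimension broadcasting via the replicate border handler configured above. A sketch, reusing the setup boilerplate from the earlier example:

    CLTensor a, b, mask;
    a.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    mask.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::U8));

    CLComparison greater;
    greater.configure(&a, &b, &mask, ComparisonOperation::Greater);
    // Or the static variant, fixing the operation at compile time:
    // CLComparisonStatic<ComparisonOperation::Greater> greater_static;
    // greater_static.configure(&a, &b, &mask);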
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
index 409d3c9..24c152f 100644
--- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index e07feb2..9da02c1 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -158,6 +158,18 @@
_scaled_output.allocator()->allocate();
}
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
+ configure(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
+ return CLDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
void CLDeconvolutionLayer::run()
{
prepare();
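The new overloads simply forward with inner_border_right = inner_border_top = 0, so call sites that never used an inner border can drop the two zero arguments. A sketch with assumed shapes (for zero padding, each output side is (in - 1) * stride + kernel):

    CLTensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 8U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(6U, 6U, 8U), 1, DataType::F32)); // (4 - 1) * 1 + 3 = 6

    CLDeconvolutionLayer deconv;
    // Equivalent to deconv.configure(&src, &weights, &bias, &dst, info, 0, 0, WeightsInfo()):
    deconv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0));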
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index b5e8fd9..e46647a 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,7 +66,7 @@
for(unsigned int i = 0; i < _num_inputs; i++)
{
_concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
depth_offset += inputs_vector.at(i)->info()->dimension(2);
}
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index 2e52e8a..dbf71ac 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -28,8 +28,8 @@
#include <utility>
-using namespace arm_compute;
-
+namespace arm_compute
+{
void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
{
auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
@@ -41,3 +41,4 @@
{
return CLDepthConvertLayerKernel::validate(input, output, policy, shift);
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 497cdae..15cbfce 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,18 +26,21 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
-CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
- : _kernel(nullptr), _border_handler()
+CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _kernel(nullptr), _border_handler(), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), _reshape_weights(), _permuted_input(),
+ _permuted_weights(), _permuted_output(), _original_weights(nullptr), _needs_permute(false), _needs_weights_reshape(false), _is_prepared(false)
{
}
@@ -47,25 +50,79 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- if(input->info()->data_layout() == DataLayout::NCHW)
+ const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+
+ _needs_permute = is_nhwc && (depth_multiplier > 1);
+ _needs_weights_reshape = is_nhwc && (depth_multiplier == 1)
+ && is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_prepared = false;
+ _original_weights = weights;
+
+ ICLTensor *input_to_use = input;
+ const ICLTensor *weights_to_use = weights;
+ ICLTensor *output_to_use = output;
+
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ DepthwiseConvolutionReshapeInfo info;
+ info.c0 = 4;
+ info.transpose = is_stride_1 && is_dot8_supported;
+
+ if(_needs_permute)
{
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input_to_nchw.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+ input_to_use = &_permuted_input;
+ weights_to_use = &_permuted_weights;
+ output_to_use = &_permuted_output;
+
_kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
}
+ else if(is_nhwc)
+ {
+ if(_needs_weights_reshape)
+ {
+ _reshape_weights.configure(weights, &_permuted_weights, info);
+ weights_to_use = &_permuted_weights;
+ }
+ _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+ }
else
{
- _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+ _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
}
+ // Configure kernel
_kernel->set_target(CLScheduler::get().target());
- _kernel->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
+ // Permute output if needed
+ if(_needs_permute)
+ {
+ // Configure the function to transform the convolved output back to NHWC
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _permute_output_to_nhwc.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
// Configure border handler
PixelValue &&zero_value(0.f);
if(is_data_type_quantized_asymmetric(input->info()->data_type()))
{
zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
}
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+ _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
}
Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -75,23 +132,99 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- if(input->data_layout() == DataLayout::NCHW)
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ const bool needs_permute = is_nhwc && (depth_multiplier > 1);
+ const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ DepthwiseConvolutionReshapeInfo info;
+ info.c0 = 4;
+ info.transpose = is_stride_1 && is_dot8_supported;
+
+ if(needs_permute)
{
- return CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target);
+ TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+
+ permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+ permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+ permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
+
+ const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
+ const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
+ const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+ }
+ else if(is_nhwc)
+ {
+ if(needs_weights_reshape)
+ {
+ auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
+ act_info));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
}
- return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ return Status{};
}
void CLDepthwiseConvolutionLayer3x3::run()
{
+ prepare();
+
+ _memory_group.acquire();
+
+ if(_needs_permute)
+ {
+ _permute_input_to_nchw.run();
+ }
CLScheduler::get().enqueue(_border_handler);
CLScheduler::get().enqueue(*_kernel);
+
+ if(_needs_permute)
+ {
+ _permute_output_to_nhwc.run();
+ }
+
+ _memory_group.release();
+}
+
+void CLDepthwiseConvolutionLayer3x3::prepare()
+{
+ if(!_is_prepared)
+ {
+ if(_needs_permute)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ _permuted_weights.allocator()->allocate();
+ _permute_weights_to_nchw.run();
+ _original_weights->mark_as_unused();
+ }
+
+ if(_needs_weights_reshape)
+ {
+ ARM_COMPUTE_ERROR_ON(_needs_permute);
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ _permuted_weights.allocator()->allocate();
+ CLScheduler::get().enqueue(_reshape_weights);
+ _original_weights->mark_as_unused();
+ }
+ _is_prepared = true;
+ }
}
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _activationlayer_function(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(),
- _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+ _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr),
+ _optimised_function(nullptr)
{
}
@@ -104,98 +237,110 @@
const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t weights_w = weights->info()->dimension(idx_w);
- const size_t weights_h = weights->info()->dimension(idx_h);
- const size_t weights_z = weights->info()->dimension(idx_c);
+ const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
- _is_prepared = false;
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- bool append_bias = (biases != nullptr) && !_is_quantized;
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Calculate output shape
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
- // Output width and height
- const unsigned int conv_w = output_shape[idx_w];
- const unsigned int conv_h = output_shape[idx_h];
-
- // Set up intermediate tensors
- const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
- const size_t conv_size = conv_w * conv_h;
-
- // Im2Col configuration
- TensorShape shape_im2col = input->info()->tensor_shape();
- shape_im2col.set(0, patch_size);
- shape_im2col.set(1, conv_size);
- shape_im2col.set(2, weights_z);
- _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _im2col_kernel.set_target(gpu_target);
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
- CLScheduler::get().tune_kernel_static(_im2col_kernel);
-
- // Weights reshape configuration
- const TensorShape shape_weights_reshape(patch_size, weights_z);
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
-
- // GEMV configuration
- DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
- TensorShape shape_v2mm_out = input->info()->tensor_shape();
- shape_v2mm_out.set(0, conv_size * weights_z);
- shape_v2mm_out.set(1, 1);
- shape_v2mm_out.set(2, 1);
- _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
- _v2mm_kernel.set_target(gpu_target);
- _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
- CLScheduler::get().tune_kernel_static(_v2mm_kernel);
- _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
-
- // Output staged configuration
- if(_is_quantized)
+ if(can_run_optimised_3x3_kernel)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
-
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
- _output_reshaped.allocator()->allocate();
+ auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
+ f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ _optimised_function = std::move(f);
}
-
- // Fill borders on inputs
- PixelValue zero_in(static_cast<int32_t>(0));
- PixelValue zero_w(static_cast<int32_t>(0));
- if(_is_quantized)
+ else
{
- zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
- zero_w = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
- }
- BorderSize border_size = _v2mm_kernel.border_size();
- _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+ const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- border_size.bottom = 0;
- _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+ const size_t weights_w = weights->info()->dimension(idx_w);
+ const size_t weights_h = weights->info()->dimension(idx_h);
+ const size_t weights_z = weights->info()->dimension(idx_c);
- // Allocate intermediate tensors
- _input_reshaped.allocator()->allocate();
- _v2mm_output.allocator()->allocate();
+ _is_prepared = false;
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
+ bool append_bias = (biases != nullptr) && !_is_quantized;
+ const GPUTarget gpu_target = CLScheduler::get().target();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
+ // Calculate output shape
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ // Output width and height
+ const unsigned int conv_w = output_shape[idx_w];
+ const unsigned int conv_h = output_shape[idx_h];
+
+ // Set up intermediate tensors
+ const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+ const size_t conv_size = conv_w * conv_h;
+
+ // Im2Col configuration
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ _im2col_kernel.set_target(gpu_target);
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+ CLScheduler::get().tune_kernel_static(_im2col_kernel);
+
+ // Weights reshape configuration
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+ _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
+
+ // GEMV configuration
+ DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
+ TensorShape shape_v2mm_out = input->info()->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ _v2mm_kernel.set_target(gpu_target);
+ _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+ CLScheduler::get().tune_kernel_static(_v2mm_kernel);
+ _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+
+ // Output staged configuration
+ if(_is_quantized)
+ {
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+ _output_reshaped.allocator()->allocate();
+ }
+
+ // Fill borders on inputs
+ PixelValue zero_in(static_cast<int32_t>(0));
+ PixelValue zero_w(static_cast<int32_t>(0));
+ if(_is_quantized)
+ {
+ zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
+ zero_w = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+ }
+ BorderSize border_size = _v2mm_kernel.border_size();
+ _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+
+ border_size.bottom = 0;
+ _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+
+ // Allocate intermediate tensors
+ _input_reshaped.allocator()->allocate();
+ _v2mm_output.allocator()->allocate();
+
+ //Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
}
@@ -204,55 +349,64 @@
{
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+ const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const bool append_bias = (biases != nullptr) && !is_quantized;
- const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
- const size_t weights_w = weights->dimension(idx_w);
- const size_t weights_h = weights->dimension(idx_h);
- const size_t weights_z = weights->dimension(idx_c);
- const unsigned int conv_w = output_shape[idx_w];
- const unsigned int conv_h = output_shape[idx_h];
- const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
- const size_t conv_size = conv_w * conv_h;
-
- TensorShape shape_im2col = input->tensor_shape();
- shape_im2col.set(0, patch_size);
- shape_im2col.set(1, conv_size);
- shape_im2col.set(2, weights_z);
- TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
-
- const TensorShape shape_weights_reshape(patch_size, weights_z);
- TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
-
- DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
- TensorShape shape_v2mm_out = input->tensor_shape();
- shape_v2mm_out.set(0, conv_size * weights_z);
- shape_v2mm_out.set(1, 1);
- shape_v2mm_out.set(2, 1);
- TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
-
- TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
-
- if(is_quantized)
+ if(can_run_optimised_3x3_kernel)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
- }
+ const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
- // Validate Activation Layer
- if(act_info.enabled())
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool append_bias = (biases != nullptr) && !is_quantized;
+ const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const size_t weights_w = weights->dimension(idx_w);
+ const size_t weights_h = weights->dimension(idx_h);
+ const size_t weights_z = weights->dimension(idx_c);
+ const unsigned int conv_w = output_shape[idx_w];
+ const unsigned int conv_h = output_shape[idx_h];
+ const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+ const size_t conv_size = conv_w * conv_h;
+
+ TensorShape shape_im2col = input->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+ DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+ TensorShape shape_v2mm_out = input->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+ TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+ }
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+ }
+ else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
}
-
return Status{};
}
@@ -260,33 +414,48 @@
{
prepare();
- CLScheduler::get().enqueue(_im2col_kernel);
- CLScheduler::get().enqueue(_v2mm_input_fill_border);
- CLScheduler::get().enqueue(_v2mm_kernel);
- CLScheduler::get().enqueue(_vector_to_tensor_kernel);
- if(_is_quantized)
+ if(_optimised_function != nullptr)
{
- CLScheduler::get().enqueue(_output_stage_kernel);
+ _optimised_function->run();
}
- if(_is_activationlayer_enabled)
+ else
{
- _activationlayer_function.run();
+ CLScheduler::get().enqueue(_im2col_kernel);
+ CLScheduler::get().enqueue(_v2mm_input_fill_border);
+ CLScheduler::get().enqueue(_v2mm_kernel);
+ CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+ if(_is_quantized)
+ {
+ CLScheduler::get().enqueue(_output_stage_kernel);
+ }
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
}
void CLDepthwiseConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if(_optimised_function != nullptr)
{
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ _optimised_function->prepare();
+ }
+ else
+ {
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- // Run weights reshaping and mark original weights tensor as unused
- _weights_reshaped.allocator()->allocate();
- CLScheduler::get().enqueue(_weights_reshape_kernel);
- CLScheduler::get().enqueue(_v2mm_weights_fill_border);
- _original_weights->mark_as_unused();
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+ _original_weights->mark_as_unused();
- CLScheduler::get().queue().finish();
- _is_prepared = true;
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
}
}
+} // namespace arm_compute
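With this change CLDepthwiseConvolutionLayer dispatches to the optimised CLDepthwiseConvolutionLayer3x3 path whenever the filter is 3x3 and falls back to the generic im2col/GEMV pipeline otherwise; NHWC inputs with depth_multiplier > 1 are handled by permuting to NCHW internally. A sketch of the dispatching entry point (shapes assumed, NCHW layout, reusing the earlier setup boilerplate):

    CLTensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(14U, 14U, 8U), 1, DataType::F32)); // 16 - 3 + 1 = 14

    CLDepthwiseConvolutionLayer dwc;
    // 3x3 filter: routed to the optimised CLDepthwiseConvolutionLayer3x3 internally
    dwc.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0));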
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
new file mode 100644
index 0000000..b7e9a68
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::RSQRT);
+ _kernel = std::move(k);
+}
+Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::RSQRT);
+}
+
+void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::EXP);
+ _kernel = std::move(k);
+}
+Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
+}
+} // namespace arm_compute
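Both wrappers share CLElementWiseUnaryLayerKernel and differ only in the ElementWiseUnary enum value they pass. A sketch, reusing the setup boilerplate from the earlier examples:

    CLTensor x, y;
    x.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));
    y.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));

    CLRsqrtLayer rsqrt;
    rsqrt.configure(&x, &y); // y[i] = 1 / sqrt(x[i])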
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
new file mode 100644
index 0000000..28f4b13
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "support/ToolchainSupport.h"
+#include <arm_compute/runtime/CL/functions/CLElementwiseOperations.h>
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace
+{
+void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+{
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+ }
+ }
+}
+} // namespace
+
+void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::ADD, input1, input2, output, policy);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy);
+}
+
+void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::SUB, input1, input2, output, policy);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy);
+}
+
+void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::DIV, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output);
+}
+
+void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::MAX, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+}
+
+void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::MIN, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+}
+
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+}
+} // namespace arm_compute
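This file replaces the three deleted per-operation sources: CLArithmeticAddition, CLArithmeticSubtraction and CLArithmeticDivision keep their public interfaces but now share CLSaturatedArithmeticOperationKernel / CLArithmeticOperationKernel, and CLElementwiseMax, CLElementwiseMin and CLElementwiseSquaredDiff are new. Existing call sites keep working unchanged; a sketch (shapes assumed) showing the unchanged addition API plus a new max, with x-dimension broadcast of the second input handled by the replicate border handler above:

    CLTensor a, b_col, sum, maxi;
    a.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
    b_col.allocator()->init(TensorInfo(TensorShape(1U, 8U), 1, DataType::F32)); // broadcast along x
    sum.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
    maxi.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));

    CLArithmeticAddition add;
    add.configure(&a, &b_col, &sum, ConvertPolicy::SATURATE);

    CLElementwiseMax emax;
    emax.configure(&a, &b_col, &maxi);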
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index baa0cf4..e91038f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,32 +33,42 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
#include "arm_compute/runtime/ITensorAllocator.h"
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
namespace
{
-inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
bool flag = true;
if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
{
- // COMPMID-852
- if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+ if((m > 1) && (n < 16))
{
- constexpr float alpha = 3.2f;
- constexpr float fact0 = 1.51f;
- constexpr float fact1 = 1.66f;
- constexpr float ops = 12.0f;
- const float scale = k > 1024 ? 1.07f : 1.0f;
- flag = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+ flag = true;
}
else
{
- flag = false;
+ // COMPMID-852
+ if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+ {
+ constexpr float alpha = 3.2f;
+ constexpr float fact0 = 1.51f;
+ constexpr float fact1 = 1.66f;
+ constexpr float ops = 12.0f;
+ const float scale = k > 1024 ? 1.07f : 1.0f;
+ flag = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+ }
+ else
+ {
+ flag = false;
+ }
}
}
else
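Solving the COMPMID-852 inequality above for n makes the threshold explicit: with alpha = 3.2, fact0 = 1.51, fact1 = 1.66 and ops = 12, reshaping wins once n exceeds about 256 when k <= 1024 (scale = 1.0) and about 145 when k > 1024 (scale = 1.07). A standalone restatement of the predicate (a sketch mirroring the code above, not the library function):

    #include <cstdio>

    // true means "reshaping A and B pays off" under the Bifrost cost model above.
    bool reshape_pays_off(unsigned int n, unsigned int k)
    {
        constexpr float alpha = 3.2f, fact0 = 1.51f, fact1 = 1.66f, ops = 12.0f;
        const float scale = k > 1024 ? 1.07f : 1.0f;
        return alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
    }

    int main()
    {
        std::printf("%d %d %d\n",
                    reshape_pays_off(250, 512),   // 0: below the ~256 break-even
                    reshape_pays_off(260, 512),   // 1: above it
                    reshape_pays_off(145, 2048)); // 1: scale 1.07 lowers the threshold
    }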
@@ -73,17 +83,19 @@
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
- _interleave_kernel(),
- _transpose_kernel(),
_mm_kernel(),
_ma_kernel(),
+ _reshape_lhs_kernel(),
+ _reshape_rhs_kernel(),
+ _mm_reshaped_kernel(),
_tmp_a(),
_tmp_b(),
_original_b(nullptr),
_is_interleaved_transposed(false),
_run_addition(false),
_reshape_b_only_on_first_run(false),
- _is_prepared(false)
+ _is_prepared(false),
+ _is_new_gemm_reshaped(false)
{
}
@@ -106,29 +118,52 @@
const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _interleave_kernel.set_target(gpu_target);
+ _reshape_lhs_kernel.set_target(gpu_target);
_mm_kernel.set_target(gpu_target);
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
// in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
- const int n = b->info()->dimension(0);
- const int k = a->info()->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
+ DataType data_type = a->info()->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
mult_transpose1xW_width = 4;
mult_interleave4x4_height = 2;
}
+ GEMMRHSMatrixInfo rhs_info;
+ rhs_info.n0 = 16 / b->info()->element_size();
+ rhs_info.k0 = 1;
+ rhs_info.h0 = mult_transpose1xW_width;
+ rhs_info.interleave = false;
+ rhs_info.transpose = false;
+
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = 4;
+ lhs_info.k0 = 4;
+ lhs_info.v0 = mult_interleave4x4_height;
+ lhs_info.interleave = true;
+ lhs_info.transpose = true;
// Check if we need to reshape the matrix A and matrix B
_is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+ // Check if we can run the new reshaped GEMM
+ const auto workload = static_cast<float>((m * n) / 20.0f);
+ _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
+
+ const bool add_matrix_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
+
// if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
if(_is_interleaved_transposed)
{
@@ -145,19 +180,37 @@
}
// _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
- // Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d());
+ if(_is_new_gemm_reshaped)
+ {
+ GEMMLHSMatrixInfo lhs_info;
- // Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+ _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+ depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Configure interleave kernel
+ _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ // Configure transpose kernel
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ }
}
- // Configure and tune matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
- mult_transpose1xW_width, mult_interleave4x4_height,
- depth_output_gemm3d, reinterpret_input_as_3d),
- gemm_info.fp_mixed_precision());
- CLScheduler::get().tune_kernel_static(_mm_kernel);
+ if(!_is_new_gemm_reshaped)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
+ GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
+ gemm_info.fp_mixed_precision());
+ CLScheduler::get().tune_kernel_static(_mm_kernel);
+ }
if(_is_interleaved_transposed)
{
@@ -170,7 +223,7 @@
}
// Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
+ if(add_matrix_c && !use_fused_add)
{
_ma_kernel.configure(c, output, beta);
_run_addition = true;
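The three booleans introduced in configure() above decide whether C is added by the matrix-multiply kernel itself or by the separate CLGEMMMatrixAdditionKernel: the addition kernel only runs when C must be added but cannot be fused (beta != 1, C is not a 1D bias vector, or the new reshaped kernel was selected). A condensed sketch of that gating, using the same predicates:

    #include <cmath>
    #include <cstddef>

    // Condensed restatement of the fused-add gating in CLGEMM::configure().
    // c_num_dimensions stands in for c->info()->num_dimensions().
    bool run_separate_addition(float beta, bool c_present, std::size_t c_num_dimensions, bool is_new_gemm_reshaped)
    {
        const bool add_matrix_c  = (beta != 0.f && c_present);
        const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
        const bool use_fused_add = is_beta_one && (c_present && c_num_dimensions == 1) && !is_new_gemm_reshaped;
        return add_matrix_c && !use_fused_add;
    }

    // run_separate_addition(1.f, true, 1, false) == false: a 1D bias is fused.
    // run_separate_addition(0.5f, true, 1, false) == true: beta != 1 forces the kernel.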
@@ -197,13 +250,15 @@
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
// in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
@@ -211,9 +266,31 @@
mult_interleave4x4_height = 2;
}
+ GEMMRHSMatrixInfo rhs_info;
+ rhs_info.n0 = 16 / b->element_size();
+ rhs_info.k0 = 1;
+ rhs_info.h0 = mult_transpose1xW_width;
+ rhs_info.interleave = false;
+ rhs_info.transpose = false;
+
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = 4;
+ lhs_info.k0 = 4;
+ lhs_info.v0 = mult_interleave4x4_height;
+ lhs_info.interleave = true;
+ lhs_info.transpose = true;
+
// Check if we need to reshape the matrix A and matrix B
const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+ // Check if we can run the new reshaped GEMM
+ const auto workload = static_cast<float>((m * n) / 20.0f);
+ const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+
+ const bool add_matrix_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+
// if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
if(run_interleave_transpose)
{
@@ -227,19 +304,42 @@
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+ if(is_new_gemm_reshaped)
+ {
+ GEMMLHSMatrixInfo lhs_info;
- // Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+ depth_output_gemm3d, reinterpret_input_as_3d)));
+ }
+ else
+ {
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+ }
}
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+ if(!is_new_gemm_reshaped)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
+ run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+ }
- if(beta != 0 && c != nullptr)
+ if(add_matrix_c && !use_fused_add)
{
// Validate matrix addition kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
@@ -257,17 +357,24 @@
if(_is_interleaved_transposed)
{
// Run interleave kernel
- CLScheduler::get().enqueue(_interleave_kernel, false);
+ CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- CLScheduler::get().enqueue(_transpose_kernel, false);
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
}
}
// Run matrix multiply kernel
- CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ if(_is_new_gemm_reshaped)
+ {
+ CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ }
// Run matrix addition kernel
if(_run_addition)
@@ -286,10 +393,11 @@
{
// Run transpose kernel and mark original weights tensor as unused
_tmp_b.allocator()->allocate();
- CLScheduler::get().enqueue(_transpose_kernel, false);
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
_original_b->mark_as_unused();
}
CLScheduler::get().queue().finish();
_is_prepared = true;
}
}
+} // namespace arm_compute
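Two conditions gate the new reshaped kernel throughout this file: workload = (m * n) / 20 must exceed 1600 (equivalently m * n > 32000), and the target must be a Bifrost GPU running F32 with reshaping already chosen. The legacy path's block sizes derive from the element size, rhs_info.n0 = 16 / element_size, giving 4 for F32, 8 for F16 and 16 for QASYMM8. A sketch of both computations, under this simplified reading of the gating:

    #include <cstdio>

    // Simplified restatement of the new-reshaped-GEMM gating above.
    bool use_new_gemm_reshaped(unsigned int m, unsigned int n, bool is_bifrost, bool is_f32, bool reshaped)
    {
        const float workload = (m * n) / 20.0f;
        return (workload > 1600.0f) && is_bifrost && reshaped && is_f32;
    }

    // rhs_info.n0 as computed above from the element size in bytes.
    unsigned int rhs_n0(unsigned int element_size) { return 16 / element_size; }

    int main()
    {
        std::printf("%d\n", use_new_gemm_reshaped(256, 256, true, true, true)); // 1: 65536 > 32000
        std::printf("%u %u %u\n", rhs_n0(4), rhs_n0(2), rhs_n0(1));             // 4 8 16 (F32, F16, QASYMM8)
    }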
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 4694aa7..7105e85 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -93,7 +93,7 @@
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
_original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
- _is_activationlayer_enabled(false), _is_prepared(false)
+ _is_activationlayer_enabled(false), _is_prepared(false), _run_addition(true)
{
}
@@ -101,7 +101,8 @@
int gemm_3d_depth)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col,
+ _run_addition));
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
@@ -125,13 +126,15 @@
}
else
{
+ // The bias does not need to be added in GEMM if im2col is used or if the Matrix Addition kernel will run
+ const bool skip_bias_in_gemm = _run_addition || !_skip_im2col;
// Configure matrix multiply function
- _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+ _mm_gemm.configure(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
}
}
Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col)
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, bool run_addition)
{
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
@@ -156,8 +159,10 @@
}
else
{
+ // The bias does not need to be added in GEMM if im2col is used or if the Matrix Addition kernel will run
+ const bool skip_bias_in_gemm = run_addition || !skip_im2col;
// Perform validation step on Matrix multiply function
- return CLGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+ return CLGEMM::validate(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
}
}
@@ -193,6 +198,8 @@
_skip_col2im = data_layout == DataLayout::NHWC;
_append_bias = (biases != nullptr) && (!_is_quantized);
_is_activationlayer_enabled = act_info.enabled();
+ // For F16, the bias is fused into GEMM instead of being added by the addition kernel
+ _run_addition = (_skip_im2col) && (_append_bias) && (data_type != DataType::F16);
// Set the GPU target for im2col and col2im
_im2col_kernel.set_target(CLScheduler::get().target());
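After this change the bias can reach the output along three routes: appended as an extra im2col column, added by the standalone saturated-add kernel (_run_addition), or fused directly into CLGEMM with beta = 1 (the F16 case with im2col skipped); quantized networks handle the bias in the GEMMLowp output stage instead. A hypothetical helper summarising the routing, using the same predicates as above:

    // Hypothetical summary of how CLGEMMConvolutionLayer routes the bias.
    enum class BiasRoute { Im2ColAppend, AddKernel, FusedInGemm, None };

    BiasRoute bias_route(bool has_bias, bool is_quantized, bool skip_im2col, bool is_f16)
    {
        const bool append_bias  = has_bias && !is_quantized;
        const bool run_addition = skip_im2col && append_bias && !is_f16; // matches _run_addition
        if(!append_bias) return BiasRoute::None;         // quantized: bias handled in the output stage
        if(!skip_im2col) return BiasRoute::Im2ColAppend; // bias appended as an im2col column
        if(run_addition) return BiasRoute::AddKernel;    // standalone saturated-add kernel
        return BiasRoute::FusedInGemm;                   // F16, im2col skipped: beta = 1 fusion
    }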
@@ -242,7 +249,7 @@
else if(_append_bias)
{
// Configure add bias kernel
- _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
+ _add_bias_kernel.configure(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE);
}
// Create GEMM output tensor
@@ -276,9 +283,9 @@
{
const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
- const float multiplier = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
+ const float multiplier = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
int min_activation = 0;
@@ -375,6 +382,8 @@
const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
const bool skip_col2im = data_layout == DataLayout::NHWC;
bool is_activationlayer_enabled = act_info.enabled();
+ // For F16, the bias is fused into GEMM instead of being added by the addition kernel
+ const bool run_addition = (skip_im2col) && (append_bias) && (data_type != DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -429,10 +438,10 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
gemm_input_to_use = &im2col_reshaped_info;
}
- else if(append_bias)
+ else if(run_addition)
{
// Validate add bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE));
}
// Create GEMM output tensor
@@ -459,9 +468,9 @@
{
const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
- const float multiplier = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
+ const float multiplier = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
@@ -496,7 +505,7 @@
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, run_addition));
// Validate Col2Im
if(!skip_col2im)
@@ -537,7 +546,7 @@
_mm_gemm.run();
}
- if(_skip_im2col && _append_bias)
+ if(_run_addition)
{
CLScheduler::get().enqueue(_add_bias_kernel);
}
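The requantization multiplier computed above, (input_scale * weights_scale) / output_scale, is below one for typical networks, and calculate_quantized_multiplier_less_than_one() decomposes it into a 32-bit fixed-point multiplier and a right shift. A sketch of that decomposition following the usual gemmlowp convention (an assumption here, not a copy of the library routine):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Express multiplier in (0, 1) as (quantized * 2^-31) * 2^-right_shift.
    void quantize_multiplier_lt_one(float multiplier, int32_t *quantized, int *right_shift)
    {
        assert(multiplier > 0.f && multiplier < 1.f);
        int exponent = 0;
        const double q = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
        *right_shift  = -exponent;
        auto q_fixed  = static_cast<int64_t>(std::round(q * (1ll << 31)));
        if(q_fixed == (1ll << 31)) // q rounded up to exactly 1.0
        {
            q_fixed /= 2;
            --(*right_shift);
        }
        *quantized = static_cast<int32_t>(q_fixed);
    }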
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2d4d231..2a01db7 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,42 +31,25 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
namespace
{
-inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
- bool flag = true;
-
- if(gpu_target_is_in(gpu_target,
- GPUTarget::G71, GPUTarget::G72,
- GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT))
- {
- // COMPMID-852
- if(k > 256 && m > 4 && reshape_b_only_on_first_run)
- {
- flag = ((0.72f + n * 0.10766f) < (n * 0.1284f));
- }
- else
- {
- flag = false;
- }
- }
- else
- {
- flag = m > 1;
- }
-
- return flag;
+ return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (m > 1) && (reshape_b_only_on_first_run);
}
} // namespace
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_mm_kernel(),
+ _mm_reshaped_kernel(),
_mtx_a_reshape_kernel(),
_mtx_b_reshape_kernel(),
_mtx_a_reduction_kernel(),
@@ -81,7 +64,7 @@
_original_b(nullptr),
_a_offset(0),
_b_offset(0),
- _is_interleaved_transposed(true),
+ _is_gemm_reshaped(true),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
_fuse_output_stage(false)
@@ -108,23 +91,23 @@
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
// in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const bool unroll_block = dot8_supported(CLKernelLibrary::get().get_device());
- const int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
- const int n = b->info()->dimension(0);
- const int k = a->info()->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- constexpr int mult_transpose1xW_width = 1;
- constexpr int mult_interleave4x4_height = 1;
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
// Check if we need to reshape the matrix A and matrix B
- _is_interleaved_transposed = is_interleaved_transposed(m, n, k, _reshape_b_only_on_first_run, gpu_target);
+ _is_gemm_reshaped = is_gemm_reshaped(m, _reshape_b_only_on_first_run, gpu_target);
- if(_is_interleaved_transposed)
+ if(_is_gemm_reshaped)
{
// if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
reinterpret_input_as_3d = false;
@@ -138,11 +121,14 @@
_memory_group.manage(&_tmp_b);
}
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
// Configure interleave kernel
- _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d(), unroll_block);
+ _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
// Configure transpose kernel
- _mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+ _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
}
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
@@ -177,10 +163,16 @@
_memory_group.manage(&_mm_result_s32);
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
- mult_transpose1xW_width, mult_interleave4x4_height,
- depth_output_gemm3d, reinterpret_input_as_3d));
+ if(_is_gemm_reshaped)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
// Configure offset contribution kernel
_offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
@@ -190,17 +182,23 @@
}
else
{
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
- mult_transpose1xW_width, mult_interleave4x4_height,
- depth_output_gemm3d, reinterpret_input_as_3d));
+ if(_is_gemm_reshaped)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
// Configure offset contribution kernel
_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
}
// Allocate tensors
- if(_is_interleaved_transposed)
+ if(_is_gemm_reshaped)
{
_tmp_a.allocator()->allocate();
if(!_reshape_b_only_on_first_run)
@@ -233,18 +231,19 @@
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
- constexpr int mult_transpose1xW_width = 1;
- constexpr int mult_interleave4x4_height = 1;
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
+ bool reshape_matrices = is_gemm_reshaped(m, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
// if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
if(reshape_matrices)
@@ -252,20 +251,24 @@
reinterpret_input_as_3d = false;
}
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
if(reshape_matrices)
{
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
// Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
// Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
}
TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -292,12 +295,22 @@
{
TensorInfo mm_result_s32_info{};
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_matrices, reshape_info)).set_data_type(DataType::S32));
+ if(reshape_matrices)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_matrices, reshape_info));
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ }
+ else
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info));
+ }
// Validate offset contribution kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -309,9 +322,16 @@
}
else
{
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_matrices, reshape_info));
-
+ if(reshape_matrices)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ }
+ else
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
+ }
// Validate offset contribution kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -329,7 +349,7 @@
_memory_group.acquire();
- if(_is_interleaved_transposed)
+ if(_is_gemm_reshaped)
{
// Run reshape matrix A
CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
@@ -348,7 +368,14 @@
}
// Run matrix multiply
- CLScheduler::get().enqueue(_mm_kernel, false);
+ if(_is_gemm_reshaped)
+ {
+ CLScheduler::get().enqueue(_mm_reshaped_kernel, false);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_mm_kernel, false);
+ }
// Run matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
@@ -374,7 +401,7 @@
{
if(!_is_prepared)
{
- if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+ if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
{
ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
@@ -395,3 +422,4 @@
_is_prepared = true;
}
}
+} // namespace arm_compute
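The reduction and offset-contribution kernels above implement the standard expansion of a quantized GEMM. Modelling real values as scale * (q - offset), each dot product sum_k (a[k] - a_off) * (b[k] - b_off) splits into the raw int32 product plus corrections built from row sums of A, column sums of B and a constant term. A scalar sketch of the identity (offset sign conventions inside the library may differ):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // sum_k (a[k] - a_off)(b[k] - b_off)
    //   == sum_k a[k]*b[k] - b_off*sum(a) - a_off*sum(b) + k*a_off*b_off
    int32_t corrected_dot(const std::vector<int32_t> &a, const std::vector<int32_t> &b,
                          int32_t a_off, int32_t b_off)
    {
        assert(a.size() == b.size());
        int32_t raw = 0, sum_a = 0, sum_b = 0;
        for(std::size_t i = 0; i < a.size(); ++i)
        {
            raw   += a[i] * b[i];
            sum_a += a[i]; // what _mtx_a_reduction_kernel accumulates per row
            sum_b += b[i]; // what _mtx_b_reduction_kernel accumulates per column
        }
        const auto k = static_cast<int32_t>(a.size());
        return raw - b_off * sum_a - a_off * sum_b + k * a_off * b_off;
    }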
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
new file mode 100644
index 0000000..459438e
--- /dev/null
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGather.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status CLGather::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+ return CLGatherKernel::validate(input, indices, output, axis);
+}
+} // namespace arm_compute
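A minimal usage sketch for the new function; the shapes and the usual CLTensor boilerplate are illustrative assumptions (indices are taken to be U32):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGather.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init(); // create CL context and queue

        CLTensor input, indices, output;
        input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32));

        CLGather gather;
        gather.configure(&input, &indices, &output, 0); // gather along dimension 0

        input.allocator()->allocate();
        indices.allocator()->allocate();
        output.allocator()->allocate();
        // ... map and fill input/indices here ...
        gather.run();
        CLScheduler::get().sync();
    }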
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 5dd1202..c50132e 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,6 +40,7 @@
_memset_kernel(),
_padded_copy_kernel(),
_cpp_nms_kernel(),
+ _is_nhwc(false),
_deltas_permuted(),
_deltas_flattened(),
_scores_permuted(),
@@ -60,10 +61,11 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+ _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
const DataType data_type = deltas->info()->data_type();
- const int num_anchors = scores->info()->dimension(2);
- const int feat_width = scores->info()->dimension(0);
- const int feat_height = scores->info()->dimension(1);
+ const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int pre_nms_topN = info.pre_nms_topN();
const int post_nms_topN = info.post_nms_topN();
@@ -77,21 +79,37 @@
_deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, data_type));
// Permute and reshape deltas
- _memory_group.manage(&_deltas_permuted);
- _memory_group.manage(&_deltas_flattened);
- _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
- _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
- _deltas_permuted.allocator()->allocate();
+ if(!_is_nhwc)
+ {
+ _memory_group.manage(&_deltas_permuted);
+ _memory_group.manage(&_deltas_flattened);
+ _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+ _deltas_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _memory_group.manage(&_deltas_flattened);
+ _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+ }
const TensorShape flatten_shape_scores(1, total_num_anchors);
_scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, data_type));
// Permute and reshape scores
- _memory_group.manage(&_scores_permuted);
- _memory_group.manage(&_scores_flattened);
- _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
- _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
- _scores_permuted.allocator()->allocate();
+ if(!_is_nhwc)
+ {
+ _memory_group.manage(&_scores_permuted);
+ _memory_group.manage(&_scores_flattened);
+ _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+ _scores_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _memory_group.manage(&_scores_flattened);
+ _flatten_scores_kernel.configure(scores, &_scores_flattened);
+ }
// Bounding box transform
_memory_group.manage(&_all_proposals);
@@ -141,11 +159,12 @@
const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
- const int num_anchors = scores->dimension(2);
- const int feat_width = scores->dimension(0);
- const int feat_height = scores->dimension(1);
+ const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
const int num_images = scores->dimension(3);
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int values_per_roi = info.values_per_roi();
@@ -156,14 +175,21 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+ TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ if(scores->data_layout() == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+ }
TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
- TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
-
TensorInfo scores_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
@@ -236,9 +262,12 @@
CLScheduler::get().enqueue(_compute_anchors_kernel, false);
// Transpose and reshape the inputs
- CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+ if(!_is_nhwc)
+ {
+ CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+ CLScheduler::get().enqueue(_permute_scores_kernel, false);
+ }
CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
- CLScheduler::get().enqueue(_permute_scores_kernel, false);
CLScheduler::get().enqueue(_flatten_scores_kernel, false);
// Build the boxes
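The permute kernels can be skipped in NHWC because the {2, 0, 1} permutation applied on the NCHW path moves channels innermost, which is exactly the order an NHWC tensor already has; flattening alone then yields the same memory layout. A small shape sketch, assuming the convention that destination dimension i takes source dimension perm[i]:

    #include <array>
    #include <cstdio>

    // dst dimension i takes src dimension perm[i] (the convention assumed here).
    std::array<int, 3> permute(const std::array<int, 3> &src, const std::array<int, 3> &perm)
    {
        return { { src[perm[0]], src[perm[1]], src[perm[2]] } };
    }

    int main()
    {
        // Shapes are listed innermost dimension first, so NCHW reads [W, H, C].
        const std::array<int, 3> nchw{ { 64, 32, 12 } };      // feat_width, feat_height, num_anchors
        const auto permuted = permute(nchw, { { 2, 0, 1 } }); // -> [12, 64, 32]: channels innermost
        std::printf("%d %d %d\n", permuted[0], permuted[1], permuted[2]);
        // The NHWC layout of the same feature map is already [12, 64, 32]: no permute needed.
    }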
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 4f709d5..2e3c6d7 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -32,8 +32,8 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
{
@@ -81,3 +81,4 @@
_memory_group.release();
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index a89c4e3..f01b1b8 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -110,9 +110,9 @@
_gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
_forget_gate_out2.allocator()->allocate();
_memory_group.manage(&_forget_gate_out5);
- _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+ _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+ _forget_gate_out1.allocator()->allocate();
CLTensor *forget_gate_out = &_forget_gate_out5;
-
if(lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -129,17 +129,18 @@
{
_forget_gate_out3.allocator()->allocate();
}
- _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
// input_gate = 1 - forget_gate, with CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ CLTensor *input_gate_out = &_input_gate_out1;
if(lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
@@ -160,17 +161,23 @@
_gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
_input_gate_out2.allocator()->allocate();
_memory_group.manage(&_input_gate_out4);
- _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+ _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
+ input_gate_out = &_input_gate_out4;
if(_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out5);
_pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out4.allocator()->allocate();
_input_gate_out5.allocator()->allocate();
+ input_gate_out = &_input_gate_out1;
}
- _input_gate_out3.allocator()->allocate();
- _input_gate_out4.allocator()->allocate();
- _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ else
+ {
+ _input_gate_out1.allocator()->allocate();
+ }
+ _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
@@ -190,14 +197,13 @@
_gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
_cell_state_out2.allocator()->allocate();
_memory_group.manage(&_cell_state_out4);
- _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+ _accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
_activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_cell_state_out4.allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _forget_gate_out1.allocator()->allocate();
- _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
// Perform clipping
@@ -223,7 +229,7 @@
_gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
_output2.allocator()->allocate();
_memory_group.manage(&_output5);
- _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
+ _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
_output3.allocator()->allocate();
CLTensor *output_gate_out = &_output5;
if(lstm_params.has_peephole_opt())
@@ -284,13 +290,13 @@
std::vector<ICLTensor *> scratch_inputs;
if(!lstm_params.has_cifg_opt())
{
- scratch_inputs.emplace_back(&_input_gate_out1);
+ scratch_inputs.emplace_back(input_gate_out);
}
scratch_inputs.emplace_back(&_cell_state_out1);
scratch_inputs.emplace_back(forget_gate_out);
scratch_inputs.emplace_back(output_gate_out);
_concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
- _input_gate_out1.allocator()->allocate();
+ input_gate_out->allocator()->allocate();
_cell_state_out1.allocator()->allocate();
forget_gate_out->allocator()->allocate();
output_gate_out->allocator()->allocate();
@@ -364,7 +370,7 @@
// Validate forget gate
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -396,7 +402,7 @@
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
// Validate cell state
@@ -544,4 +550,4 @@
_concat_scratch_buffer.run();
_memory_group.release();
-}
\ No newline at end of file
+}
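With CIFG the dedicated input gate above is replaced by its coupled form: instead of Activation(W_i x + R_i h + b_i), the layer computes input_gate = 1 - forget_gate, which is what the SUB kernel against the _ones tensor implements. A scalar statement of the coupling, with logistic() standing in for the LOGISTIC activation:

    #include <cmath>

    float logistic(float x) { return 1.0f / (1.0f + std::exp(-x)); }

    // Coupled input/forget gate (CIFG): the input gate is not computed from
    // its own weights but derived from the forget gate.
    float cifg_input_gate(float forget_gate_preactivation)
    {
        const float forget_gate = logistic(forget_gate_preactivation);
        return 1.0f - forget_gate;
    }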
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 7e5278f..559b57f 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,8 +28,8 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
#include "support/ToolchainSupport.h"
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 32d8f15..8489fab 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@
_norm_kernel.configure(input, output, norm_info);
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
}
Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index de43c7d..3aa1b1e 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,21 +34,21 @@
{
}
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding)
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value)
{
// Copy the input to the output
_copy_kernel.configure(input, output, padding);
// Set the pages of the output to zero
- _memset_kernel.configure(output, PixelValue());
+ _memset_kernel.configure(output, constant_value);
// Fill padding on the first two dimensions with zeros
- _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT);
+ _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT, constant_value);
}
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding)
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, constant_value));
ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
return Status{};
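A minimal usage sketch of the extended interface; PaddingList is taken to be the usual per-dimension list of (before, after) element counts, and the shapes are illustrative:

    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPadLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(10U, 12U), 1, DataType::F32));

        // Pad dimension 0 by (1, 1) and dimension 1 by (2, 2), filling with 1.0f
        // instead of the previously hard-coded zeros.
        const PaddingList padding = { { 1, 1 }, { 2, 2 } };

        CLPadLayer pad;
        pad.configure(&input, &output, padding, PixelValue(1.0f));

        input.allocator()->allocate();
        output.allocator()->allocate();
        pad.run();
    }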
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 1809e6e..63f00ac 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -60,7 +60,7 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
return Status{};
@@ -90,7 +90,7 @@
_add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
_memory_group.manage(&_add_output);
- _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+ _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
_fully_connected_out.allocator()->allocate();
_gemm_output.allocator()->allocate();
@@ -127,4 +127,4 @@
_is_prepared = true;
}
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 0f480ee..7bb4178 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,7 @@
using namespace arm_compute;
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
{
// Configure ROI pooling kernel
auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
new file mode 100644
index 0000000..b2cd472
--- /dev/null
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRange.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLRange::configure(ICLTensor *output, const float start, const float end, const float step)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLRangeKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(output, start, end, step);
+ _kernel = std::move(k);
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status CLRange::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+ return CLRangeKernel::validate(output, start, end, step);
+}
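
For reference, a minimal usage sketch of the new CLRange function; the scheduler setup and the five-element F32 output are illustrative assumptions, not part of this patch. CLRange fills the output tensor with the sequence [start, end) at the given step:

// Sketch: generate 0, 2, 4, 6, 8 into a 5-element tensor.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLRange.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // ceil((10 - 0) / 2) = 5 output elements
    CLTensor output;
    output.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32));
    output.allocator()->allocate();

    CLRange range;
    range.configure(&output, 0.f, 10.f, 2.f);
    range.run();
    CLScheduler::get().sync();
    return 0;
}
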
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index 1016ff7..b2d0f81 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,22 +45,31 @@
_reduced_outs = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
_keep_dims = keep_dims;
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+
+ // Convert negative axes
+ for(unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
// Perform reduction for every axis
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
- out_shape.set(reduction_axis[i], 1);
+ out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
if(i == _reduction_ops - 1 && keep_dims)
{
- _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
}
else
{
_reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
_memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
}
}
@@ -77,11 +86,10 @@
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
- Coordinates axis_copy = reduction_axis;
- std::sort(axis_copy.begin(), axis_copy.begin() + _reduction_ops);
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
- out_shape.remove_dimension(axis_copy[i] - i);
+ out_shape.remove_dimension(axis_local[i] - i);
}
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
@@ -90,22 +98,43 @@
Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
- ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
- for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
+ TensorShape out_shape = input->tensor_shape();
+
+ Coordinates axis_sorted = reduction_axis;
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+
+ // Convert negative axes
+ for(unsigned int i = 0; i < reduction_ops; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis[i] > 3);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
+ axis_sorted[i] = wrap_around(axis_sorted[i], input_dims);
+ }
+
+ std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops);
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1);
if(output->total_size() > 0 && keep_dims)
{
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1);
}
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+ if(keep_dims)
+ {
+ out_shape.set(axis_sorted[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_sorted[i] - i);
+ }
}
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
return Status{};
}
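
Negative reduction axes are now wrapped with wrap_around in both configure and validate, and validate additionally checks the output shape against the one it computes. A sketch of the newly supported negative-axis form; the 4D shape is an illustrative assumption:

// Sketch: reduce the first two dimensions, addressed with negative indices.
CLTensor input, output;
input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U, 2U), 1, DataType::F32));

Coordinates axes;
axes.set(0, -4); // wraps to axis 0
axes.set(1, -3); // wraps to axis 1

CLReduceMean reduce_mean;
reduce_mean.configure(&input, axes, /* keep_dims */ true, &output);
// ...allocate tensors, fill input, then reduce_mean.run()
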
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index c5447ff..3d82e3f 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,15 +56,19 @@
} // namespace
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
+ : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
{
}
Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-
- if(axis == 0 && !is_data_type_quantized(input->data_type()))
+ bool is_serial = is_data_type_quantized(input->data_type()) || axis != 0;
+ if(is_serial)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+ }
+ else
{
// Create temporary tensor infos
auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
@@ -81,17 +85,25 @@
}
ReductionOperation first_kernel_op;
+ ReductionOperation intermediate_kernel_op;
ReductionOperation last_kernel_op;
switch(op)
{
case ReductionOperation::SUM:
case ReductionOperation::MEAN_SUM:
- first_kernel_op = ReductionOperation::SUM;
- last_kernel_op = op;
+ first_kernel_op = ReductionOperation::SUM;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
break;
case ReductionOperation::SUM_SQUARE:
- first_kernel_op = ReductionOperation::SUM_SQUARE;
- last_kernel_op = ReductionOperation::SUM;
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = ReductionOperation::SUM;
+ break;
+ case ReductionOperation::PROD:
+ first_kernel_op = ReductionOperation::PROD;
+ intermediate_kernel_op = ReductionOperation::PROD;
+ last_kernel_op = ReductionOperation::PROD;
break;
default:
ARM_COMPUTE_ERROR("Not supported");
@@ -103,17 +115,13 @@
// Validate ReductionOperation on intermediate stages
for(unsigned int i = 1; i < num_of_stages - 1; ++i)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
}
// Validate ReductionOperation on the last stage
const unsigned int last_stage = num_of_stages - 1;
ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
}
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
- }
return Status{};
}
@@ -122,65 +130,77 @@
{
_num_of_stages = calculate_number_of_stages(input->info(), axis);
_reduction_axis = axis;
- _is_quantized = is_data_type_quantized(input->info()->data_type());
+ _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
// Configure reduction operation kernels
_reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
// Create temporary tensors
- if(axis == 0 && !_is_quantized)
+ if(_is_serial)
+ {
+ _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ }
+ else
{
_border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
- _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ _results_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
TensorShape shape{ input->info()->tensor_shape() };
for(unsigned int i = 0; i < _num_of_stages - 1; i++)
{
shape.set(0, ceil(shape.x() / 128.f));
- _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+ _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
}
// Apply ReductionOperation only on first kernel
- _memory_group.manage(_sums_vector.get());
+ _memory_group.manage(_results_vector.get());
ReductionOperation first_kernel_op;
+ ReductionOperation intermediate_kernel_op;
ReductionOperation last_kernel_op;
+ PixelValue pixelValue;
switch(op)
{
case ReductionOperation::SUM:
case ReductionOperation::MEAN_SUM:
- first_kernel_op = ReductionOperation::SUM;
- last_kernel_op = op;
+ first_kernel_op = ReductionOperation::SUM;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
+ pixelValue = PixelValue();
break;
case ReductionOperation::SUM_SQUARE:
- first_kernel_op = ReductionOperation::SUM_SQUARE;
- last_kernel_op = ReductionOperation::SUM;
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = ReductionOperation::SUM;
+ pixelValue = PixelValue();
+ break;
+ case ReductionOperation::PROD:
+ first_kernel_op = ReductionOperation::PROD;
+ intermediate_kernel_op = ReductionOperation::PROD;
+ last_kernel_op = ReductionOperation::PROD;
+ pixelValue = PixelValue(1, input->info()->data_type());
break;
default:
ARM_COMPUTE_ERROR("Not supported");
}
- _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
- _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+ _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
// Apply ReductionOperation on intermediate stages
for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
{
- _memory_group.manage(_sums_vector.get() + i);
- _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
- _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[i - 1].allocator()->allocate();
+ _memory_group.manage(_results_vector.get() + i);
+ _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
+ _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+ _results_vector[i - 1].allocator()->allocate();
}
// Apply ReductionOperation on the last stage
const unsigned int last_stage = _num_of_stages - 1;
const unsigned int input_width = input->info()->dimension(0);
- _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
- _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[last_stage - 1].allocator()->allocate();
- }
- else
- {
- _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+ _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+ _results_vector[last_stage - 1].allocator()->allocate();
}
}
@@ -188,7 +208,11 @@
{
_memory_group.acquire();
- if(_reduction_axis == 0 && !_is_quantized)
+ if(_is_serial)
+ {
+ CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
+ }
+ else
{
for(unsigned int i = 0; i < _num_of_stages; ++i)
{
@@ -196,10 +220,6 @@
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
}
- else
- {
- CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
- }
_memory_group.release();
}
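
ReductionOperation::PROD is now accepted on the multi-stage (axis 0, non-quantized) path; the border handler there pads with PixelValue(1, ...) because 1, not 0, is the identity element for products. A usage sketch with an illustrative shape:

// Sketch: multiply all elements along axis 0, per row.
CLTensor input, output;
input.allocator()->init(TensorInfo(TensorShape(256U, 4U), 1, DataType::F32));

CLReductionOperation reduction;
reduction.configure(&input, &output, /* axis */ 0, ReductionOperation::PROD);
// ...allocate tensors, fill input, then reduction.run()
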
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
new file mode 100644
index 0000000..0f86b9f
--- /dev/null
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+
+#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLReverseKernel>();
+ k->configure(input, output, axis);
+ _kernel = std::move(k);
+}
+
+Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+ return CLReverseKernel::validate(input, output, axis);
+}
+} // namespace arm_compute
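
A hedged usage sketch for the new CLReverse function. The axis argument is itself a small tensor holding the dimension indices to flip; the 1D U32 layout below is an assumption for illustration:

// Sketch: reverse a 3D tensor along dimensions 0 and 1.
CLTensor input, output, axis;
input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); // holds {0, 1}, assumed U32

CLReverse reverse;
reverse.configure(&input, &output, &axis);
// ...allocate tensors, write the axis indices, then reverse.run()
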
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
new file mode 100644
index 0000000..90c368e
--- /dev/null
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSelect.h"
+
+#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSelectKernel>();
+ k->configure(c, x, y, output);
+ _kernel = std::move(k);
+}
+
+Status CLSelect::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+ return CLSelectKernel::validate(c, x, y, output);
+}
+} // namespace arm_compute
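
A hedged sketch for the new CLSelect function, which computes output[i] = c[i] ? x[i] : y[i] element-wise; the U8 condition-mask type is an assumption for illustration:

// Sketch: element-wise select between two F32 tensors.
CLTensor c, x, y, output;
const TensorShape shape(64U, 64U);
c.allocator()->init(TensorInfo(shape, 1, DataType::U8)); // condition mask, assumed U8
x.allocator()->init(TensorInfo(shape, 1, DataType::F32));
y.allocator()->init(TensorInfo(shape, 1, DataType::F32));

CLSelect select;
select.configure(&c, &x, &y, &output);
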
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index bef7eca..f630853 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -36,10 +36,10 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- // Get absolute end coordinates
+ // Construct the slice end mask from the (possibly negative) end coordinates
- const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->info()->tensor_shape(), ends);
+ const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
- k->configure(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+ k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
_kernel = std::move(k);
}
@@ -54,8 +54,8 @@
}));
- // Get absolute end coordinates
+ // Construct the slice end mask from the (possibly negative) end coordinates
- const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->tensor_shape(), ends);
+ const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
- return CLStridedSliceKernel::validate(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+ return CLStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
}
} // namespace arm_compute
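
Rather than resolving negative end coordinates to absolute ones up front, CLSlice now forwards them unchanged and encodes the open dimensions in a bit mask built by construct_slice_end_mask. A sketch of the resulting semantics, shapes illustrative:

// Sketch: starts = (2, 0), ends = (-1, 5); the -1 end now means
// "up to the end of dimension 0" via the slice end mask.
CLTensor input, output;
input.allocator()->init(TensorInfo(TensorShape(10U, 10U), 1, DataType::F32));

CLSlice slice;
slice.configure(&input, &output, Coordinates(2, 0), Coordinates(-1, 5));
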
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 76c1e18..a24b72e 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -33,20 +33,19 @@
namespace arm_compute
{
CLSpaceToBatchLayer::CLSpaceToBatchLayer()
- : _space_to_batch_kernel(), _output(nullptr), _has_padding(false)
+ : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
{
}
void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
+ _memset_kernel.configure(output, PixelValue());
}
-
- _output = output;
_space_to_batch_kernel.configure(input, block_shape, paddings, output);
}
@@ -57,42 +56,35 @@
if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
+ _memset_kernel.configure(output, PixelValue());
}
-
- _output = output;
_space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
}
Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
{
- return CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+ return Status{};
}
Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
const ITensorInfo *output)
{
- return CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+ return Status{};
}
void CLSpaceToBatchLayer::run()
{
// Zero out output only if we have paddings
- // TODO(micspy01): replace with memset once ready
if(_has_padding)
{
- _output->map(CLScheduler::get().queue(), true);
- if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
- {
- const uint8_t quantized_zero = _output->info()->quantization_info().offset;
- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
- }
- else
- {
- memset(_output->buffer(), 0, _output->info()->total_size());
- }
- _output->unmap(CLScheduler::get().queue());
+ CLScheduler::get().enqueue(_memset_kernel, true);
}
-
CLScheduler::get().enqueue(_space_to_batch_kernel, true);
}
} // namespace arm_compute
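
Zero-filling of the padded output now runs on-device through CLMemsetKernel instead of mapping the buffer and filling it on the host, removing a host-device round trip; PixelValue() also covers the quantized zero point uniformly, so the special case disappears. A usage sketch; the S32 block-shape and paddings layouts are assumptions for illustration:

// Sketch: space-to-batch with block shape and paddings given as tensors.
CLTensor input, block_shape, paddings, output;
input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32));
block_shape.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));  // assumed 1D S32
paddings.allocator()->init(TensorInfo(TensorShape(2U, 2U), 1, DataType::S32)); // assumed 2x2 S32

CLSpaceToBatchLayer space_to_batch;
space_to_batch.configure(&input, &block_shape, &paddings, &output);
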
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
new file mode 100644
index 0000000..71327fe
--- /dev/null
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <complex>
+
+#include "arm_compute/runtime/CL/functions/CLStackLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLStackLayer::CLStackLayer() // NOLINT
+ : _input(),
+ _stack_kernels(),
+ _num_inputs(0)
+{
+}
+
+void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+{
+ _num_inputs = input.size();
+ _stack_kernels = arm_compute::support::cpp14::make_unique<CLStackLayerKernel[]>(_num_inputs);
+
+ // Wrap around negative values
+ const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
+
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+ }
+}
+
+Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input.empty());
+
+ // Wrap around negative values
+ const size_t rank = input[0]->num_dimensions();
+ const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
+
+ const unsigned int num_inputs = input.size();
+
+ for(unsigned int i = 0; i < num_inputs; i++)
+ {
+ // All the tensors must have the same rank
+ ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
+ // Validate Kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
+ }
+
+ return Status{};
+}
+
+void CLStackLayer::run()
+{
+ for(unsigned i = 0; i < _num_inputs; i++)
+ {
+ CLScheduler::get().enqueue(_stack_kernels[i], false);
+ }
+}
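
A sketch for the new CLStackLayer; negative axes are wrapped against rank + 1, so -1 creates a new trailing dimension. Shapes are illustrative:

// Sketch: stack two 32x16 tensors into a 32x16x2 output.
CLTensor a, b, output;
const TensorShape shape(32U, 16U);
a.allocator()->init(TensorInfo(shape, 1, DataType::F32));
b.allocator()->init(TensorInfo(shape, 1, DataType::F32));

CLStackLayer stack;
stack.configure({ &a, &b }, /* axis */ -1, &output); // -1 wraps to a new last axis
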
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
new file mode 100644
index 0000000..ec6a4ab
--- /dev/null
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTile.h"
+
+#include "arm_compute/core/CL/kernels/CLTileKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLTileKernel>();
+ k->configure(input, output, multiples);
+ _kernel = std::move(k);
+}
+
+Status CLTile::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+ return CLTileKernel::validate(input, output, multiples);
+}
+} // namespace arm_compute
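
A sketch for the new CLTile function; Multiples holds the per-dimension repeat counts (the values below are illustrative):

// Sketch: repeat a 4x3 tensor twice along dimension 0, once along dimension 1 -> 8x3.
CLTensor input, output;
input.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));

CLTile tile;
tile.configure(&input, &output, Multiples{ 2, 1 });
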
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
new file mode 100644
index 0000000..428d091
--- /dev/null
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLUnstack.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
+{
+ return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
+}
+
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+{
+ // Sets up the coordinates to slice the input tensor: start coordinates are set to all 0s and the unstacking axis of both start and end is set to slice just one 2D tensor at a time.
+ Coordinates slice_end;
+ slice_start.set_num_dimensions(input_num_dimensions);
+ slice_end.set_num_dimensions(input_num_dimensions);
+ for(size_t k = 0; k < input_num_dimensions; ++k)
+ {
+ slice_start.set(k, 0);
+ slice_end.set(k, -1);
+ }
+ slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(slice_end);
+}
+} // namespace
+
+CLUnstack::CLUnstack() // NOLINT
+ : _num_slices(0),
+ _strided_slice_vector()
+{
+}
+
+void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+{
+ std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
+
+ // Wrap around negative values
+ const unsigned int axis_u = wrap_axis(axis, input->info());
+ _num_slices = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
+ _strided_slice_vector = arm_compute::support::cpp14::make_unique<CLStridedSlice[]>(_num_slices);
+
+ Coordinates slice_start;
+ int32_t slice_end_mask;
+ setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
+ for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ {
+ // Adjusts start and end coordinates to take a 2D slice at a time
+ slice_start.set(axis_u, slice);
+ _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ }
+}
+
+Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &output_vector, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_vector.empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < (-static_cast<int>(input->tensor_shape().num_dimensions())));
+ ARM_COMPUTE_RETURN_ERROR_ON(axis >= static_cast<int>(input->tensor_shape().num_dimensions()));
+ const unsigned int num_slices = std::min(output_vector.size(), input->dimension(wrap_axis(axis, input)));
+ ARM_COMPUTE_RETURN_ERROR_ON(num_slices > input->dimension(wrap_axis(axis, input)));
+ ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
+ Coordinates slice_start;
+ int32_t slice_end_mask;
+ for(size_t k = 0; k < num_slices; ++k)
+ {
+ slice_start.set(wrap_axis(axis, input), k);
+ setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ }
+ return Status{};
+}
+
+void CLUnstack::run()
+{
+ for(unsigned i = 0; i < _num_slices; ++i)
+ {
+ _strided_slice_vector[i].run();
+ }
+}
+
+} // namespace arm_compute
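
A sketch for the new CLUnstack, which is implemented as one CLStridedSlice per output slice along the (possibly negative, wrapped) axis. Shapes are illustrative:

// Sketch: split an 8x8x2 tensor into two 8x8 outputs along axis 2.
CLTensor input, out0, out1;
input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32));

CLUnstack unstack;
unstack.configure(&input, { &out0, &out1 }, /* axis */ 2);
// ...allocate tensors, fill input, then unstack.run()
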
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 46a2d80..d0801a6 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,8 +50,8 @@
ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
// Output auto initialization if not yet initialized
- TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ TensorInfo tmp_output_info = *output->clone();
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
switch(num_inputs)
@@ -90,7 +90,7 @@
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 1abcb67..069196e 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -84,8 +84,8 @@
} // namespace
CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
- _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
+ : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr),
+ _is_prepared(false)
{
}
@@ -133,14 +133,7 @@
(input->info()->data_type() == DataType::F16)));
// Configure output transform
- _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
-
- // Configure activation layer
- _is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
+ _output_transform.configure(&_batched_mm_output, biases, output, winograd_info, act_info);
// Allocate temporary tensors
_input0.allocator()->allocate();
@@ -216,11 +209,6 @@
// Run output transform
CLScheduler::get().enqueue(_output_transform);
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
-
_memory_group.release();
}
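
The activation layer no longer runs as a separate pass; the ActivationLayerInfo is forwarded into the output transform kernel, saving one kernel launch and one full read/write of the output tensor. The public interface is unchanged; a sketch, with the tensors and a 3x3 same-padding convolution as assumptions:

// Sketch: the RELU here is now fused into the Winograd output transform.
// input, weights, biases and output are assumed initialised for a 3x3 F32 convolution.
CLWinogradConvolutionLayer winograd;
winograd.configure(&input, &weights, &biases, &output,
                   PadStrideInfo(1, 1, 1, 1),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
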
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 09e8456..7361eb2 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@
auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
k->configure(input, output, winograd_info);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
}
Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)